Quick Start

Michael Koohafkan

2021-06-09

This document gets you up and running with hdfqlr, an R interface to HDFql. In order to use this package, you will need to download HDFql for your system.

If you are going to be using HDFql regularly, it's a good idea to set a default HDFql directory for use with hdfqlr. You can do this by either:

- Setting the R option hdfqlr.dir in your .Rprofile file.
- Setting the system environment variable HDFQL_DIR.

If either of these settings exists, the HDFql library will be loaded on package start.

library(hdfqlr)

Otherwise, you can load the HDFql library after package start with hql_load():

hql_load('path/to/hdfql-x.x.x')

If you are on a Linux system, you will need to update the environment variable LD_LIBRARY_PATH to include the HDFql directories prior to using the package:

export LD_LIBRARY_PATH=<hdfql_r_wrapper_directory>:<hdfql_lib_directory>:$LD_LIBRARY_PATH

The hdfqlr package relies on the R wrapper provided by HDFql. Functions exported by the package are prefixed with hql_ to make it easy to differentiate them from the functions provided by the wrapper, which are prefixed with HDFQL_ (for constants) or hdfql_ (for functions). The contents of the HDFql wrapper are imported into the environment hql$wrapper. If you need to directly use the HDFql wrapper functions in an interactive session, you can access them using with() or by attaching the environment to the search path. Any existing scripts written for use with the HDFql wrapper can therefore be run by attaching the hql$wrapper environment prior to running the script, or by using with(hql$wrapper, ...).

# using with
with(hql$wrapper,
    HDFQL_VERSION
)
## [1] "2.1.0"
# or attach the environment
attach(hql$wrapper)
## The following objects are masked from hql$wrapper (pos = 4):
## 
##     HDFQL_ARRAY, HDFQL_ASCII, HDFQL_ATTRIBUTE, HDFQL_BIG_ENDIAN,
##     HDFQL_BIGINT, HDFQL_BITFIELD, HDFQL_CHAR, HDFQL_CHUNKED,
##     HDFQL_COMPACT, HDFQL_COMPOUND, HDFQL_CONTIGUOUS, hdfql_cursor,
##     hdfql_cursor_, hdfql_cursor_absolute, hdfql_cursor_clear,
##     hdfql_cursor_clone, hdfql_cursor_first, hdfql_cursor_get_bigint,
##     hdfql_cursor_get_char, hdfql_cursor_get_count,
##     hdfql_cursor_get_data_type, hdfql_cursor_get_double,
##     hdfql_cursor_get_float, hdfql_cursor_get_int,
##     hdfql_cursor_get_position, hdfql_cursor_get_size,
##     hdfql_cursor_get_smallint, hdfql_cursor_get_tinyint,
##     hdfql_cursor_get_unsigned_bigint, hdfql_cursor_get_unsigned_int,
##     hdfql_cursor_get_unsigned_smallint,
##     hdfql_cursor_get_unsigned_tinyint, hdfql_cursor_initialize,
##     hdfql_cursor_last, hdfql_cursor_next, hdfql_cursor_previous,
##     hdfql_cursor_relative, hdfql_cursor_use, hdfql_cursor_use_default,
##     HDFQL_DATASET, HDFQL_DIRECTORY, HDFQL_DISABLED, HDFQL_DOUBLE,
##     HDFQL_EARLIEST, HDFQL_EARLY, HDFQL_ENABLED, HDFQL_ENUMERATION,
##     HDFQL_ERROR_AFTER_LAST, HDFQL_ERROR_ALREADY_EXISTS,
##     HDFQL_ERROR_BEFORE_FIRST, HDFQL_ERROR_DANGLING_LINK,
##     HDFQL_ERROR_EMPTY, HDFQL_ERROR_FULL, hdfql_error_get_line,
##     hdfql_error_get_message, hdfql_error_get_position,
##     HDFQL_ERROR_INVALID_FILE, HDFQL_ERROR_INVALID_REGULAR_EXPRESSION,
##     HDFQL_ERROR_NO_ACCESS, HDFQL_ERROR_NO_ADDRESS,
##     HDFQL_ERROR_NOT_ENOUGH_MEMORY, HDFQL_ERROR_NOT_ENOUGH_SPACE,
##     HDFQL_ERROR_NOT_FOUND, HDFQL_ERROR_NOT_OPEN,
##     HDFQL_ERROR_NOT_REGISTERED, HDFQL_ERROR_NOT_SUPPORTED,
##     HDFQL_ERROR_OUTSIDE_LIMIT, HDFQL_ERROR_PARSE,
##     HDFQL_ERROR_UNEXPECTED_DATA_TYPE,
##     HDFQL_ERROR_UNEXPECTED_STORAGE_TYPE, HDFQL_ERROR_UNEXPECTED_TYPE,
##     HDFQL_ERROR_UNKNOWN, hdfql_execute, hdfql_execute_get_status,
##     HDFQL_EXTERNAL_LINK, HDFQL_FILE, HDFQL_FILL_DEFAULT,
##     HDFQL_FILL_UNDEFINED, HDFQL_FILL_USER_DEFINED, HDFQL_FLOAT,
##     hdfql_get_canonical_path, HDFQL_GLOBAL, HDFQL_GROUP,
##     HDFQL_INCREMENTAL, HDFQL_INDEXED, HDFQL_INT, HDFQL_LATE,
##     HDFQL_LATEST, HDFQL_LITTLE_ENDIAN, HDFQL_LOCAL, hdfql_mpi_get_rank,
##     hdfql_mpi_get_size, HDFQL_NO, HDFQL_OPAQUE, HDFQL_REFERENCE,
##     hdfql_shared_library, HDFQL_SMALLINT, HDFQL_SOFT_LINK,
##     hdfql_subcursor_absolute, hdfql_subcursor_first,
##     hdfql_subcursor_get_bigint, hdfql_subcursor_get_char,
##     hdfql_subcursor_get_count, hdfql_subcursor_get_double,
##     hdfql_subcursor_get_float, hdfql_subcursor_get_int,
##     hdfql_subcursor_get_position, hdfql_subcursor_get_size,
##     hdfql_subcursor_get_smallint, hdfql_subcursor_get_tinyint,
##     hdfql_subcursor_get_unsigned_bigint,
##     hdfql_subcursor_get_unsigned_int,
##     hdfql_subcursor_get_unsigned_smallint,
##     hdfql_subcursor_get_unsigned_tinyint, hdfql_subcursor_last,
##     hdfql_subcursor_next, hdfql_subcursor_previous,
##     hdfql_subcursor_relative, HDFQL_SUCCESS, HDFQL_TINYINT,
##     HDFQL_TRACKED, HDFQL_UNDEFINED, HDFQL_UNLIMITED,
##     HDFQL_UNSIGNED_BIGINT, HDFQL_UNSIGNED_INT, HDFQL_UNSIGNED_SMALLINT,
##     HDFQL_UNSIGNED_TINYINT, HDFQL_UNSIGNED_VARBIGINT,
##     HDFQL_UNSIGNED_VARINT, HDFQL_UNSIGNED_VARSMALLINT,
##     HDFQL_UNSIGNED_VARTINYINT, HDFQL_UTF8, HDFQL_VARBIGINT,
##     HDFQL_VARCHAR, HDFQL_VARDOUBLE, HDFQL_VARFLOAT,
##     hdfql_variable_get_count, hdfql_variable_get_data_type,
##     hdfql_variable_get_dimension, hdfql_variable_get_dimension_count,
##     hdfql_variable_get_number, hdfql_variable_get_size,
##     hdfql_variable_register, hdfql_variable_transient_register,
##     hdfql_variable_unregister, hdfql_variable_unregister_all,
##     HDFQL_VARINT, HDFQL_VARSMALLINT, HDFQL_VARTINYINT, HDFQL_VERSION,
##     HDFQL_VERSION_18, HDFQL_YES
HDFQL_VERSION
## [1] "2.1.0"
detach(hql$wrapper)

The hdfqlr package is primarily designed for simple read and write use cases, i.e. inspecting, reading and writing HDF datasets and attributes. In order to access HDF files, they must be opened or "used":

file = tempfile(fileext = ".h5")
hql_create_file(file)
hql_use_file(file)

Creation of datasets and attributes is accomplished with the hql_write_* family of functions. Groups and sub-groups are created on the fly when writing datasets or attributes, but can also be explicitly created using hql_create_group(). The following example writes a matrix to the file. The dataset is then read back in and compared to the original R object. The function hql_flush is used to ensure that any buffered data is written to the HDF file prior to reading the data back into R.

values = matrix(rnorm(1000), nrow = 50)
hql_write_dataset(values, "dataset0")
hql_flush()
all.equal(values, hql_read_dataset("dataset0"),
  check.attributes = FALSE)
## [1] TRUE

Any attributes of the R object (other than dim) are also written to the dataset. To write an attribute (or a list of attributes) to the root of the HDF file or to a group, use hql_write_attribute (or hql_write_all_attributes). When writing character datasets, the maximum string length is used for all elements of the dataset.

char.values = month.name
attr(char.values, "abb") = month.abb
hql_write_dataset(char.values, "group1/dataset1")
hql_flush()
hql_read_dataset("group1/dataset1", include.attributes = TRUE)
##  [1] "January  " "February " "March    " "April    " "May      " "June     "
##  [7] "July     " "August   " "September" "October  " "November " "December "
## attr(,"abb")
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"

The package does not currently support editing existing datasets or attributes; this is instead accomplished by reading the dataset or attribute to an R object, modifying it, and writing it back to the HDF file using the argument overwrite = TRUE.

The package also provides options for listing the contents of an HDF file via the functions hql_list_groups, hql_list_datasets, and hql_list_attributes. Both hql_list_groups and hql_list_datasets provide support for recursively listing sub-groups and sub-datasets.

hql_list_groups()
## [1] "group1"
hql_list_datasets()
## [1] "dataset0"
hql_list_datasets(recursive = TRUE)
## [1] "dataset0"        "group1/dataset1"
hql_list_attributes("group1/dataset1")
## [1] "abb"

Datasets, attributes, and groups can be removed from an HDF file via the functions hql_drop_dataset, hql_drop_attribute, and hql_drop_group.

hql_drop_dataset("dataset0")

Once you're finished working with the HDF objects, close the file with hql_close_file:

hql_close_file(file)