// hdf5.js

import * as utils from "./utils.js";
import * as wasm from "./wasm.js";
import * as packer from "./internal/pack_strings.js";
import * as fac from "./factorize.js";

/**
 * Validate `x` against the requested dataset `shape`.
 *
 * For a non-empty `shape`, the length of `x` must equal the product of the dimensions.
 * For an empty `shape` (i.e., a scalar), `x` is either an Array/ArrayBuffer view of length 1,
 * or a bare value that gets wrapped in a length-1 array.
 *
 * @param {*} x - Values to be checked.
 * @param {Array} shape - Dimensions of the dataset; empty for a scalar.
 * @return {*} `x` itself, or `[x]` if a bare scalar value was supplied for a scalar dataset.
 */
function check_shape(x, shape) {
    if (shape.length > 0) {
        const expected = shape.reduce((product, extent) => product * extent);
        if (x.length != expected) {
            throw new Error("length of 'x' must be equal to the product of 'shape'");
        }
        return x;
    }

    // Scalar dataset: bare values are boxed, array-likes must hold exactly one element.
    const is_array_like = (x instanceof Array || ArrayBuffer.isView(x));
    if (!is_array_like) {
        return [x];
    }

    if (x.length != 1) {
        throw new Error("length of 'x' should be 1 for a scalar dataset");
    }
    return x;
}

/**
 * Determine the shape to use for `x`, inferring it when `shape` is `null`.
 *
 * When `shape` is `null`, a bare string or number is treated as a scalar (empty shape, boxed into a length-1 array),
 * and anything else is treated as 1-dimensional with extent `x.length`.
 * Otherwise, `x` is validated against the supplied `shape` via `check_shape`.
 *
 * @param {*} x - Values to be written.
 * @param {?Array} shape - Requested dimensions, or `null` to infer them from `x`.
 * @return {object} Object containing the (possibly boxed) `x` and the resolved `shape`.
 */
function guess_shape(x, shape) {
    if (shape !== null) {
        return { x: check_shape(x, shape), shape: shape };
    }

    const is_bare_scalar = (typeof x == "string" || typeof x == "number");
    if (is_bare_scalar) {
        return { x: [x], shape: [] };
    }

    return { x: x, shape: [x.length] };
}

/**
 * Reject plain arrays that contain strings, as these cannot be written to a non-string HDF5 dataset.
 * Inputs that are not plain arrays are not inspected.
 *
 * @param {*} x - Values to be checked.
 */
function forbid_strings(x) {
    if (!Array.isArray(x)) {
        return;
    }

    const has_string = x.some(entry => typeof entry === "string");
    if (has_string) {
        throw new Error("'x' should not contain any strings for a non-string HDF5 dataset");
    }
}

/**
 * Find the largest value among the string lengths.
 *
 * @param {object} lengths - Object with an `array()` method that returns the individual string lengths.
 * @return {number} The largest length, or 0 if there are no lengths.
 */
function fetch_max_string_length(lengths) {
    return lengths.array().reduce((longest, current) => (longest < current ? current : longest), 0);
}

/**
 * Prepare the integer codes and levels for writing a HDF5 enum.
 *
 * If `levels` is `null`, the levels are inferred by factorizing `values`, and every inferred level must be a string.
 * Otherwise, `values` must already be non-negative integer codes that are less than `levels.length`.
 *
 * @param {*} values - Either integer codes (when `levels` is supplied) or the values to be factorized.
 * @param {?Array} levels - Enum levels, or `null` to infer them from `values`.
 * @param {string} arg - Name of the argument, used in error messages.
 * @return {object} Object containing the integer `values` and the `levels`.
 */
function process_enum_input(values, levels, arg) {
    if (levels === null) {
        const factor = fac.convertToFactor(values, { asWasmArray: false });
        for (const level of factor.levels) {
            if (typeof level !== "string") {
                throw new Error(`'${arg}' should only contain string levels for 'type = "Enum"'`);
            }
        }
        return { values: factor.ids, levels: factor.levels };
    }

    for (const code of values) {
        const in_range = (Number.isInteger(code) && code >= 0 && code < levels.length);
        if (!in_range) {
            throw new Error(`'${arg}' must contain non-negative integers less than 'levels.length' for 'type = "Enum"'`);
        }
    }
    return { values: values, levels: levels };
}

/**
 * Base class for HDF5 objects.
 * This stores the file path, the name of the object and the names of its attributes,
 * and provides methods to read and write those attributes.
 */
export class H5Base {
    #file;
    #name;
    #attributes;

    /**
     * @param {string} file - Path to the HDF5 file.
     * @param {string} name - Name of the object inside the file.
     */
    constructor(file, name) {
        this.#file = file;
        this.#name = name;
    }

    /**
     * @member {string}
     * @desc Path to the HDF5 file.
     */
    get file() {
        return this.#file;
    }

    /**
     * @member {string}
     * @desc Name of the object inside the file.
     */
    get name() {
        return this.#name;
    }

    /**
     * @member {Array}
     * @desc Array containing the names of all attributes of this object.
     */
    get attributes() {
        return this.#attributes;
    }

    set_attributes(attributes) { // internal use only, for subclasses.
        this.#attributes = attributes;
    }

    /**
     * Read an attribute of the object.
     *
     * @param {string} attr - Name of the attribute.
     * @return {object} Object containing the attribute `values` and the `shape` of the attribute.
     * For HDF5 enums, an additional `levels` property is present, containing the levels indexed by the integer `values`.
     */
    readAttribute(attr) {
        let output = { values: null, shape: null };

        let x = wasm.call(module => new module.LoadedH5Attr(this.file, this.name, attr));
        try {
            output.shape = Array.from(x.shape());

            let type = x.type();
            if (type == "other") {
                throw new Error("cannot load dataset for an unsupported type");
            }

            if (type == "String") {
                output.values = packer.unpack_strings(x.string_buffer(), x.string_lengths());
            } else if (type == "Enum") {
                // slice() takes a copy, as 'x' is deleted before we return.
                output.values = x.numeric_values().slice();
                output.levels = packer.unpack_strings(x.string_buffer(), x.string_lengths());
            } else {
                output.values = x.numeric_values().slice();
            }
        } finally {
            x.delete();
        }

        return output;
    }

    /**
     * Create an (empty) attribute of the specified type and shape, and record its name in the list of attributes.
     * Internal use only.
     *
     * @param {string} attr - Name of the attribute.
     * @param {string} type - Type of the attribute, see {@linkcode H5Base#writeAttribute writeAttribute}.
     * @param {Array} shape - Dimensions of the attribute.
     * @param {object} [options={}] - Optional parameters.
     * @param {?number} [options.maxStringLength=null] - Maximum string length, only used when `type = "String"`.
     * @param {?Array} [options.levels=null] - Enum levels, mandatory when `type = "Enum"`.
     */
    #create_attribute(attr, type, shape, { maxStringLength = null, levels = null } = {}) { // internal use only.
        let shape_arr = utils.wasmifyArray(shape, "Int32WasmArray");
        try {
            if (type == "String") {
                wasm.call(module => module.create_string_hdf5_attribute(this.file, this.name, attr, shape_arr.length, shape_arr.offset, maxStringLength));
            } else if (type == "Enum") {
                if (levels == null) {
                    throw new Error("levels must be supplied if 'type = \"Enum\"'");
                }
                let [ lengths, buffer ] = packer.repack_strings(levels);
                try {
                    wasm.call(module => module.create_enum_hdf5_attribute(this.file, this.name, attr, shape_arr.length, shape_arr.offset, lengths.length, lengths.offset, buffer.offset));
                } finally {
                    // The packed levels are only needed for the duration of the call;
                    // release them here, as is done for the packed strings in writeAttribute().
                    utils.free(buffer);
                    utils.free(lengths);
                }
            } else {
                wasm.call(module => module.create_numeric_hdf5_attribute(this.file, this.name, attr, shape_arr.length, shape_arr.offset, type));
            }
            this.#attributes.push(attr);
        } finally {
            shape_arr.free();
        }
    }

    /**
     * Write an attribute for the object.
     *
     * @param {string} attr - Name of the attribute.
     * @param {string} type - Type of dataset to create.
     * This can be `"IntX"` or `"UintX"` for `X` of 8, 16, 32, or 64;
     * or `"FloatX"` for `X` of 32 or 64;
     * or `"String"`;
     * or `"Enum"`.
     * @param {?Array} shape - Array containing the dimensions of the dataset to create.
     * If set to an empty array, this will create a scalar dataset.
     * If set to `null`, this is determined from `x`.
     * @param {(TypedArray|Array|string|number)} x - Values to be written to the new dataset, see {@linkcode H5DataSet#write write}.
     * This should be of length equal to the product of `shape`;
     * unless `shape` is empty, in which case it should either be of length 1, or a single number or string.
     * @param {object} [options={}] - Optional parameters.
     * @param {?number} [options.maxStringLength=null] - Maximum length of the strings to be saved.
     * Only used when `type = "String"`.
     * If `null`, this is inferred from the maximum length of strings in `x`.
     * @param {?Array} [options.levels=null] - Array of strings containing enum levels when `type = "Enum"`.
     * If supplied, `x` should be an array of integers that index into `levels`.
     * Alternatively, `levels` may be `null`, in which case `x` should be an array of strings that is used to infer `levels`.
     *
     * @return The attribute is created and `x` is written to it.
     * No return value is provided.
     */
    writeAttribute(attr, type, shape, x, { maxStringLength = null, levels = null } = {}) {
        if (x === null) {
            throw new Error("cannot write 'null' to HDF5");
        }

        let guessed = guess_shape(x, shape);
        x = guessed.x;
        shape = guessed.shape;

        if (type == "String") {
            let [ lengths, buffer ] = packer.repack_strings(x);
            try {
                if (maxStringLength == null) {
                    maxStringLength = fetch_max_string_length(lengths);
                }
                this.#create_attribute(attr, type, shape, { maxStringLength: maxStringLength });
                wasm.call(module => module.write_string_hdf5_attribute(this.file, this.name, attr, lengths.length, lengths.offset, buffer.offset));
            } finally {
                utils.free(buffer);
                utils.free(lengths);
            }

        } else if (type == "Enum") {
            let processed = process_enum_input(x, levels, "x");
            let y = utils.wasmifyArray(processed.values, "Int32WasmArray");
            try {
                this.#create_attribute(attr, type, shape, { levels: processed.levels });
                wasm.call(module => module.write_enum_hdf5_attribute(this.file, this.name, attr, y.offset));
            } finally {
                y.free();
            }

        } else {
            forbid_strings(x);
            let y = utils.wasmifyArray(x, null);
            try {
                this.#create_attribute(attr, type, shape);
                wasm.call(module => module.write_numeric_hdf5_attribute(this.file, this.name, attr, y.constructor.className, y.offset));
            } finally {
                y.free();
            }
        }

        return;
    }
}

/**
 * Representation of a group inside a HDF5 file.
 *
 * @augments H5Base
 */
export class H5Group extends H5Base {
    #children;

    /**
     * @param {string} file - Path to the HDF5 file.
     * @param {string} name - Name of the group inside the file.
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.newlyCreated=false] - Whether the group was just created.
     * If `true`, the group is assumed to be empty and its details are not read from file.
     * This is intended for internal use.
     */
    constructor(file, name, { newlyCreated = false } = {}) {
        super(file, name);

        if (newlyCreated) {
            this.#children = {};
            this.set_attributes([]);
        } else {
            let x = wasm.call(module => new module.H5GroupDetails(file, name));
            try {
                let child_names = packer.unpack_strings(x.child_buffer(), x.child_lengths());
                let child_types = x.child_types();
                let type_options = [ "Group", "DataSet", "Other" ]; // indexed by the integer codes in 'child_types'.

                this.#children = {};
                for (let i = 0; i < child_names.length; i++) {
                    this.#children[child_names[i]] = type_options[child_types[i]];
                }

                let unpacked = packer.unpack_strings(x.attr_buffer(), x.attr_lengths());
                this.set_attributes(unpacked);
            } finally {
                x.delete();
            }
        }
    }

    /**
     * @member {object}
     * @desc An object where the keys are the names of the immediate children and the values are strings specifying the object type of each child.
     * Each string can be one of `"Group"`, `"DataSet"` or `"Other"`.
     */
    get children() {
        return this.#children;
    }

    #child_name(child) {
        let new_name = this.name;
        if (new_name != "/") {
            new_name += "/";
        }
        new_name += child;
        return new_name;
    }

    #has_child(name) {
        // Only own properties count as children; the 'in' operator would also match
        // names inherited from Object.prototype, e.g., "constructor" or "toString".
        return Object.prototype.hasOwnProperty.call(this.#children, name);
    }

    /**
     * @param {string} name - Name of the child element to open.
     * @param {object} [options={}] - Further options to pass to the {@linkplain H5Group} or {@linkplain H5DataSet} constructors.
     *
     * @return {H5Group|H5DataSet} Object representing the child element.
     */
    open(name, options = {}) {
        if (!this.#has_child(name)) {
            throw new Error("no '" + name + "' child in this HDF5 Group");
        }

        let new_name = this.#child_name(name);
        let child_type = this.#children[name];
        if (child_type == "Group") {
            return new H5Group(this.file, new_name, options);
        } else if (child_type == "DataSet") {
            return new H5DataSet(this.file, new_name, options);
        } else {
            throw new Error("don't know how to open '" + name + "'");
        }
    }

    /**
     * @param {string} name - Name of the group to create.
     *
     * @return {H5Group} A group is created as an immediate child of the current group.
     * A {@linkplain H5Group} object is returned representing this new group.
     * If a group already exists at `name`, it is returned directly.
     */
    createGroup(name) {
        let new_name = this.#child_name(name);
        if (this.#has_child(name)) {
            if (this.children[name] == "Group") {
                return new H5Group(this.file, new_name);
            } else {
                throw new Error("existing child '" + new_name + "' is not a HDF5 group");
            }
        } else {
            wasm.call(module => module.create_hdf5_group(this.file, new_name));
            this.children[name] = "Group";
            return new H5Group(this.file, new_name, { newlyCreated: true });
        }
    }

    /**
     * @param {string} name - Name of the dataset to create.
     * @param {string} type - Type of dataset to create.
     * This can be `"IntX"` or `"UintX"` for `X` of 8, 16, 32, or 64;
     * or `"FloatX"` for `X` of 32 or 64;
     * or `"String"`;
     * or `"Enum"`.
     * @param {Array} shape - Array containing the dimensions of the dataset to create.
     * This can be set to an empty array to create a scalar dataset.
     * @param {object} [options={}] - Optional parameters.
     * @param {number} [options.maxStringLength=10] - Maximum length of the strings to be saved.
     * Only used when `type = "String"`.
     * @param {number} [options.compression=6] - Deflate compression level.
     * @param {?Array} [options.chunks=null] - Array containing the chunk dimensions.
     * This should have length equal to `shape`, with each value being no greater than the corresponding value of `shape`.
     * If `null`, it defaults to `shape`.
     * @param {?Array} [options.levels=null] - Array of strings containing enum levels.
     * Only used (and mandatory) when `type = "Enum"`.
     *
     * @return {H5DataSet} A dataset of the specified type and shape is created as an immediate child of the current group.
     * A {@linkplain H5DataSet} object is returned representing this new dataset.
     */
    createDataSet(name, type, shape, { maxStringLength = 10, levels = null, compression = 6, chunks = null } = {}) {
        let new_name = this.#child_name(name);

        let shape_arr;
        let chunk_arr;
        try {
            shape_arr = utils.wasmifyArray(shape, "Int32WasmArray");

            // Without explicit chunks, the shape itself is used as the chunk dimensions.
            let chunk_offset = shape_arr.offset;
            if (chunks !== null) {
                chunk_arr = utils.wasmifyArray(chunks, "Int32WasmArray");
                if (chunk_arr.length != shape_arr.length) {
                    throw new Error("'chunks' and 'shape' should have the same dimensions");
                }
                chunk_offset = chunk_arr.offset;
            }

            if (type == "String") {
                wasm.call(module => module.create_string_hdf5_dataset(this.file, new_name, shape_arr.length, shape_arr.offset, compression, chunk_offset, maxStringLength));
            } else if (type == "Enum") {
                if (levels == null) {
                    throw new Error("levels must be supplied if 'type = \"Enum\"'");
                }
                let [ lengths, buffer ] = packer.repack_strings(levels);
                try {
                    wasm.call(module => module.create_enum_hdf5_dataset(this.file, new_name, shape_arr.length, shape_arr.offset, compression, chunk_offset, lengths.length, lengths.offset, buffer.offset));
                } finally {
                    // The packed levels are only needed for the duration of the call.
                    utils.free(buffer);
                    utils.free(lengths);
                }
            } else {
                wasm.call(module => module.create_numeric_hdf5_dataset(this.file, new_name, shape_arr.length, shape_arr.offset, compression, chunk_offset, type));
            }

        } finally {
            // Either array may still be undefined if its allocation failed;
            // skipping those avoids masking the original error with a TypeError.
            if (chunk_arr !== undefined) {
                chunk_arr.free();
            }
            if (shape_arr !== undefined) {
                shape_arr.free();
            }
        }

        this.children[name] = "DataSet";
        return new H5DataSet(this.file, new_name, { newlyCreated: true, type: type, shape: shape });
    }

    /**
     * This convenience method combines {@linkcode H5Group#createDataSet createDataSet} with {@linkcode H5DataSet#write write}.
     * It is particularly useful for string types as it avoids having to specify the `maxStringLength` during creation based on the `x` used during writing.
     *
     * @param {string} name - Name of the dataset to create.
     * @param {string} type - Type of dataset to create.
     * This can be `"IntX"` or `"UintX"` for `X` of 8, 16, 32, or 64;
     * or `"FloatX"` for `X` of 32 or 64;
     * or `"String"`;
     * or `"Enum"`.
     * @param {?Array} shape - Array containing the dimensions of the dataset to create.
     * If set to an empty array, this will create a scalar dataset.
     * If set to `null`, this is determined from `x`.
     * @param {(TypedArray|Array|string|number)} x - Values to be written to the new dataset, see {@linkcode H5DataSet#write write}.
     * @param {object} [options={}] - Optional parameters.
     * @param {?Array} [options.levels=null] - Array of strings containing enum levels when `type = "Enum"`.
     * If supplied, `x` should be an array of integers that index into `levels`.
     * Alternatively, `levels` may be `null`, in which case `x` should be an array of strings that is used to infer `levels`.
     * @param {number} [options.compression=6] - Deflate compression level.
     * @param {?Array} [options.chunks=null] - Array containing the chunk dimensions.
     * This should have length equal to `shape`, with each value being no greater than the corresponding value of `shape`.
     * If `null`, it defaults to `shape`.
     * @param {boolean} [options.cache=false] - Whether to cache the written values in the returned {@linkplain H5DataSet} object.
     *
     * @return {H5DataSet} A dataset of the specified type and shape is created as an immediate child of the current group.
     * The same dataset is then filled with the contents of `x`.
     * A {@linkplain H5DataSet} object is returned representing this new dataset.
     */
    writeDataSet(name, type, shape, x, { levels = null, compression = 6, chunks = null, cache = false } = {}) {
        if (x === null) {
            throw new Error("cannot write 'null' to HDF5");
        }

        let guessed = guess_shape(x, shape);
        x = guessed.x;
        shape = guessed.shape;

        let handle;
        if (type == "String") {
            // The strings are packed once and reused for both the maximum length and the write itself.
            let [ lengths, buffer ] = packer.repack_strings(x);
            try {
                let maxlen = fetch_max_string_length(lengths);
                handle = this.createDataSet(name, "String", shape, { maxStringLength: maxlen, compression: compression, chunks: chunks });
                wasm.call(module => module.write_string_hdf5_dataset(handle.file, handle.name, lengths.length, lengths.offset, buffer.offset));
            } finally {
                utils.free(lengths);
                utils.free(buffer);
            }
            handle.cache_loaded(x, cache);

        } else if (type == "Enum") {
            let processed = process_enum_input(x, levels, "x");
            handle = this.createDataSet(name, type, shape, { levels: processed.levels, compression: compression, chunks: chunks });
            handle.write(processed.values, { cache: cache });

        } else {
            handle = this.createDataSet(name, type, shape, { compression: compression, chunks: chunks });
            handle.write(x, { cache: cache });
        }

        return handle;
    }
}

/**
 * Representation of a HDF5 file, exposed as the group at the root (`"/"`) of the file.
 *
 * @augments H5Group
 */
export class H5File extends H5Group {
    /**
     * @param {string} file - Path to the HDF5 file.
     * @param {object} [options={}] - Further options to pass to the {@linkplain H5Group} constructor.
     */
    constructor(file, options = {}) {
        const root_name = "/";
        super(file, root_name, options);
    }
}

/**
 * Create a new HDF5 file and return a handle to it.
 *
 * @param {string} path - Path to the file.
 *
 * @return {H5File} A new file is created at `path`.
 * A {@linkplain H5File} object is returned, flagged as newly created.
 */
export function createNewHdf5File(path) {
    const make_file = module => module.create_hdf5_file(path);
    wasm.call(make_file);

    const options = { newlyCreated: true };
    return new H5File(path, options);
}

/**
 * Representation of a dataset inside a HDF5 file.
 *
 * @augments H5Base
 */
export class H5DataSet extends H5Base {
    #shape;
    #type;
    #values;
    #levels = null; // stays null until enum levels are read from file.
    #loaded;

    // Read the type, shape, values, enum levels (if any) and attribute names of a dataset from file.
    static #load(file, name) {
        let vals;
        let type;
        let shape;
        let attr;
        let levels = null;

        let x = wasm.call(module => new module.LoadedH5DataSet(file, name));
        try {
            type = x.type();
            if (type == "other") {
                throw new Error("cannot load dataset for an unsupported type");
            }

            if (type == "String") {
                vals = packer.unpack_strings(x.string_buffer(), x.string_lengths());
            } else if (type == "Enum") {
                // slice() takes a copy, as 'x' is deleted before we return.
                vals = x.numeric_values().slice();
                levels = packer.unpack_strings(x.string_buffer(), x.string_lengths());
            } else {
                vals = x.numeric_values().slice();
            }

            shape = Array.from(x.shape());
            attr = packer.unpack_strings(x.attr_buffer(), x.attr_lengths());
        } finally {
            x.delete();
        }

        return {
            "values": vals,
            "type": type,
            "shape": shape,
            "levels": levels,
            "attributes": attr
        };
    }

    /**
     * @param {string} file - Path to the HDF5 file.
     * @param {string} name - Name of the dataset inside the file.
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.load=false] - Whether or not to load the contents of the dataset in the constructor.
     * If `false`, the contents can be loaded later with {@linkcode H5DataSet#load load}.
     * @param {boolean} [options.newlyCreated=false] - Whether the dataset was just created.
     * If `true`, nothing is read from file and `shape` and `type` must be supplied.
     * This is intended for internal use.
     * @param {?Array} [options.shape=null] - Dimensions of the dataset, only used if `newlyCreated = true`.
     * @param {?string} [options.type=null] - Type of the dataset, only used if `newlyCreated = true`.
     * @param {?(Array|TypedArray)} [options.values=null] - Contents of the dataset to cache, only used if `newlyCreated = true`.
     */
    constructor(file, name, { load = false, newlyCreated = false, shape = null, type = null, values = null } = {}) {
        super(file, name);

        if (newlyCreated) {
            if (shape === null || type === null) {
                throw new Error("need to pass 'shape' and 'type' if 'newlyCreated = true'");
            }
            this.#shape = shape;
            this.#type = type;
            this.#values = values;
            this.#loaded = (values !== null);
            this.set_attributes([]);
        } else {
            if (!load) {
                let x = wasm.call(module => new module.H5DataSetDetails(file, name));
                try {
                    this.#type = x.type();
                    this.#shape = Array.from(x.shape());
                    this.#values = null;
                    this.set_attributes(packer.unpack_strings(x.attr_buffer(), x.attr_lengths()));
                } finally {
                    x.delete();
                }
            } else {
                let deets = H5DataSet.#load(file, name);
                this.#type = deets.type;
                this.#shape = deets.shape;
                this.#values = deets.values;
                this.#levels = deets.levels;
                this.set_attributes(deets.attributes);
            }
            this.#loaded = load;
        }
    }

    /**
     * @member {string}
     * @desc String containing the type of the dataset.
     * This may be `"IntX"` or `"UintX"` for `X` of 8, 16, 32, or 64;
     * or `"FloatX"` for `X` of 32 or 64;
     * `"String"`, `"Enum"`, or `"Other"`.
     */
    get type() {
        return this.#type;
    }

    /**
     * @member {Array}
     * @desc Array of integers containing the dimensions of the dataset.
     * If this is empty, the dataset is a scalar.
     */
    get shape() {
        return this.#shape;
    }

    /**
     * @member {boolean}
     * @desc Whether the contents of the dataset have already been loaded.
     */
    get loaded() {
        return this.#loaded;
    }

    /**
     * @member {?(Array|TypedArray)}
     * @desc The contents of this dataset, or `null` if they have not been loaded or cached.
     * This has length equal to the product of {@linkcode H5DataSet#shape shape};
     * unless this dataset is scalar, in which case it has length 1.
     */
    get values() {
        return this.#values;
    }

    /**
     * @member {?Array}
     * @desc Levels of a HDF5 enum, to be indexed by the integer `values`.
     * This is `null` for non-enum data, or if the levels have not been read from file.
     */
    get levels() {
        return this.#levels;
    }

    /**
     * @return {Array|TypedArray} The contents of this dataset are loaded and cached in this {@linkplain H5DataSet} object.
     * For HDF5 enums, the levels are loaded at the same time.
     * A (Typed)Array is returned containing those contents.
     */
    load() {
        if (!this.#loaded) {
            let deets = H5DataSet.#load(this.file, this.name);
            this.#values = deets.values;
            this.#levels = deets.levels; // keep the enum levels in sync with the loaded codes.
            this.#loaded = true;
        }
        return this.#values;
    }

    cache_loaded(x, cache) { // internal use only.
        if (cache) {
            this.#values = x.slice();
            this.#loaded = true;
        } else {
            // Drop any previously cached contents, which no longer match the file.
            this.#loaded = false;
            this.#values = null;
        }
    }

    /**
     * @param {(Array|TypedArray|number|string)} x - Values to write to the dataset.
     * This should be of length equal to the product of {@linkcode H5DataSet#shape shape};
     * unless `shape` is empty, in which case it should either be of length 1, or a single number or string.
     * For HDF5 enums, this should contain the integer codes.
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.cache=false] - Whether to cache the written values in this {@linkplain H5DataSet} object.
     *
     * @return `x` is written to the dataset on file.
     * No return value is provided.
     */
    write(x, { cache = false } = {}) {
        if (x === null) {
            throw new Error("cannot write 'null' to HDF5");
        }

        x = check_shape(x, this.shape);

        if (this.type == "String") {
            let [ lengths, buffer ] = packer.repack_strings(x);
            try {
                wasm.call(module => module.write_string_hdf5_dataset(this.file, this.name, lengths.length, lengths.offset, buffer.offset));
            } finally {
                utils.free(buffer);
                utils.free(lengths);
            }
            this.cache_loaded(x, cache);

        } else if (this.type == "Enum") {
            let y = utils.wasmifyArray(x, "Int32WasmArray");
            try {
                wasm.call(module => module.write_enum_hdf5_dataset(this.file, this.name, y.offset));
                // As for the other types, refresh (or invalidate) the cache so that
                // 'values' never reports contents from before this write.
                this.cache_loaded(y, cache);
            } finally {
                y.free();
            }

        } else {
            forbid_strings(x);
            let y = utils.wasmifyArray(x, null);
            try {
                wasm.call(module => module.write_numeric_hdf5_dataset(this.file, this.name, y.constructor.className, y.offset));
                this.cache_loaded(y, cache);
            } finally {
                y.free();
            }
        }

        return;
    }
}

/**
 * Fill `output` with the names and types of the immediate children of `host`.
 *
 * Groups are represented by nested objects, which are filled recursively if `recursive = true`.
 * Datasets are represented by a string of the form `"<class> dataset"`,
 * where the class is `"integer"`, `"float"` or the lower-cased dataset type.
 * Children that are neither groups nor datasets are reported as `"other"`, without attempting to open them.
 *
 * @param {object} host - Group-like object with a `children` property and an `open()` method.
 * @param {object} output - Object to be filled in place, using the child names as keys.
 * @param {boolean} [recursive=true] - Whether to descend into child groups.
 */
function extract_names(host, output, recursive = true) {
    for (const [key, val] of Object.entries(host.children)) {
        if (val == "Group") {
            output[key] = {};
            if (recursive) {
                extract_names(host.open(key), output[key], recursive);
            }
        } else if (val == "Other") {
            // Such children cannot be opened, so they are reported directly
            // instead of aborting the whole listing with an error.
            output[key] = "other";
        } else {
            let data = host.open(key);

            let dclass;
            if (data.type.startsWith("Uint") || data.type.startsWith("Int")) {
                dclass = "integer";
            } else if (data.type.startsWith("Float")) {
                dclass = "float";
            } else {
                dclass = data.type.toLowerCase();
            }

            output[key] = dclass + " dataset";
        }
    }
}

/**
 * Extract object names from a HDF5 file.
 *
 * @param {string} path - Path to a HDF5 file.
 * For web applications, this should be saved to the virtual filesystem with {@linkcode writeFile}.
 * @param {object} [options={}] - Optional parameters.
 * @param {string} [options.group=""] - Group to use as the root of the search.
 * If an empty string is supplied, the entire file is used as the root.
 * @param {boolean} [options.recursive=true] - Whether to recursively extract names inside child groups.
 *
 * @return {object} Nested object where the keys are the names of the HDF5 objects and values are their types.
 * HDF5 groups are represented by nested Javascript objects in the values;
 * these nested objects are empty if `recursive = false`.
 * HDF5 datasets are represented by strings describing the data type - e.g., `"integer dataset"`, `"float dataset"` or `"string dataset"`.
 */
export function extractHdf5ObjectNames (path, { group = "", recursive = true } = {}) {
    const root = (group == "" ? new H5File(path) : new H5Group(path, group));
    const collected = {};
    extract_names(root, collected, recursive);
    return collected;
}

/**
 * Load a dataset from a HDF5 file.
 *
 * @param {string} path - Path to a HDF5 file.
 * For web applications, this should be saved to the virtual filesystem with {@linkcode writeFile}.
 * @param {string} name - Name of a dataset inside the HDF5 file.
 *
 * @return {object} An object containing:
 * - `dimensions`, an array containing the dimensions of the dataset, see {@linkcode H5DataSet#shape shape}.
 * - `contents`, a (Typed)Array containing the loaded contents of the dataset, see {@linkcode H5DataSet#values values}.
 */
export function loadHdf5Dataset(path, name) {
    const { shape, values } = new H5DataSet(path, name, { load: true });
    return {
        "dimensions": shape,
        "contents": values
    };
}