// readers/h5ad.js - Documentation

import * as scran from "scran.js";
import * as bioc from "bioconductor";
import * as eutils from "./utils/extract.js";
import * as futils from "./utils/features.js";
import * as afile from "./abstract/file.js";

/**************************
 ******* Internals ********
 **************************/

/**
 * Summarize the dimensions and storage layout of a single H5AD assay matrix.
 *
 * H5AD puts its features in the second dimension of each matrix.
 * The reported `rows` and `columns` are flipped relative to the on-disk shape,
 * so that `rows` counts features and `columns` counts cells for both dense and sparse assays.
 *
 * @param {object} handle - Handle to the assay, either a `scran.H5DataSet` (dense matrix)
 * or a `scran.H5Group` carrying `shape` and `encoding-type` attributes (sparse matrix).
 *
 * @return {object} Object containing `rows`, `columns` and `sparse`.
 * For sparse assays, it also contains `csc`, which is `true` when the on-disk encoding is `"csr_matrix"`
 * (i.e., compressed sparse column once the dimensions are flipped).
 *
 * @throws {Error} If `handle` is neither a dataset nor a group.
 */
function extract_assay_details(handle) {
    if (handle instanceof scran.H5DataSet) {
        // Same flip as in the sparse case below: the on-disk shape is cells-by-features.
        const shape = handle.shape;
        return {
            rows: shape[1],
            columns: shape[0],
            sparse: false
        };
    }

    if (handle instanceof scran.H5Group) {
        const shape_info = handle.readAttribute("shape");
        const enc_info = handle.readAttribute("encoding-type");
        // Yes, the flip is deliberate, because of how H5AD puts its features in the columns.
        return {
            rows: shape_info.values[1],
            columns: shape_info.values[0],
            sparse: true,
            csc: enc_info.values[0] === "csr_matrix"
        };
    }

    throw new Error("unknown type for a H5AD assay matrix");
}

/**
 * Enumerate the assays in a H5AD file and collect the details of each one.
 *
 * The top-level "X" entry is listed first (if present), followed by every child of the "layers" group,
 * each named as "layers/<child>".
 *
 * @param {object} handle - Handle to the root of the H5AD file.
 * @param {string} path - Path to the file. Currently unused.
 *
 * @return {object} Object containing `names` (array of assay names), `details` (per-assay details, keyed by name),
 * and the `rows` and `columns` of the first assay in `names`.
 *
 * @throws {Error} If "layers" exists but is not a group, or if no assay is found.
 */
function fetch_assay_details(handle, path) {
    const names = [];
    const details = {};
    const register = (assay_name, assay_handle) => {
        names.push(assay_name);
        details[assay_name] = extract_assay_details(assay_handle);
    };

    const top_level = handle.children;
    if ("X" in top_level) {
        register("X", handle.open("X"));
    }

    if ("layers" in top_level) {
        const layers = handle.open("layers");
        if (!(layers instanceof scran.H5Group)) {
            throw new Error("expected a 'layers' group in a H5AD file");
        }
        Object.keys(layers.children).forEach(layer => {
            register("layers/" + layer, layers.open(layer));
        });
    }

    if (names.length == 0) {
        throw new Error("failed to find any assay in the H5AD file");
    }

    // The first assay defines the reported dimensions.
    const { rows, columns } = details[names[0]];
    return { names, rows, columns, details };
}

/**
 * Load a H5AD data frame group (e.g., "obs" or "var") into a DataFrame.
 *
 * Each child dataset becomes a column, with "Enum" datasets expanded to their levels
 * and datasets of type "Other" skipped.
 * Child groups containing both "categories" and "codes" are treated as factors.
 * A "__categories" group, if present, supplies the levels for any already-loaded column of the same name.
 * The "_index" column, if present, is removed from the columns and used as the row names.
 *
 * @param {object} handle - Handle to the group containing the data frame.
 *
 * @return {?DataFrame} A `bioc.DataFrame`, or `null` if no columns could be loaded.
 */
function load_data_frame(handle) {
    const columns = {};

    for (const [key, val] of Object.entries(handle.children)) {
        if (val === "DataSet") {
            const dhandle = handle.open(key, { load: true });
            if (dhandle.type === "Enum") {
                const levels = dhandle.levels;
                columns[key] = Array.from(dhandle.values, x => levels[x]);
            } else if (dhandle.type !== "Other") {
                columns[key] = dhandle.values;
            }

        } else if (val === "Group") {
            // Factor encoding for H5AD versions >= 0.8.0.
            const subhandle = handle.open(key);
            if ("categories" in subhandle.children && "codes" in subhandle.children) {
                const current_levels = eutils.extractHdf5Strings(subhandle, "categories");
                if (!current_levels) {
                    // Same treatment as the legacy encoding below: skip rather than fail on unusable levels.
                    console.warn(`ignoring invalid levels for nominally categorical key '${key}' in the H5AD reader`);
                    continue;
                }
                const codes = subhandle.open("codes", { load: true }).values;
                columns[key] = bioc.SLICE(current_levels, codes);
            }
        }
    }

    // Factor encoding for H5AD versions < 0.8.0.
    if ("__categories" in handle.children && handle.children["__categories"] === "Group") {
        const chandle = handle.open("__categories");

        for (const key of Object.keys(chandle.children)) {
            if (!(key in columns)) {
                continue;
            }
            const current_levels = eutils.extractHdf5Strings(chandle, key);
            if (!current_levels) {
                console.warn(`ignoring invalid levels for nominally categorical key '${key}' in the H5AD reader`);
                continue;
            }
            columns[key] = bioc.SLICE(current_levels, columns[key]);
        }
    }

    if (Object.keys(columns).length === 0) {
        return null;
    }

    let rn = null;
    if ("_index" in columns) {
        rn = columns._index;
        delete columns._index;
    }
    return new bioc.DataFrame(columns, { rowNames: rn });
}

/**
 * Load the per-feature annotations from the "var" group of a H5AD file.
 *
 * @param {object} handle - Handle to the root of the H5AD file.
 * @return {?DataFrame} Result of `load_data_frame` on the "var" group, or `null` if there is no such group.
 */
function fetch_features(handle) {
    const listing = handle.children;
    const has_var_group = "var" in listing && listing["var"] == "Group";
    return has_var_group ? load_data_frame(handle.open("var")) : null;
}

/**
 * Load the per-cell annotations from the "obs" group of a H5AD file.
 *
 * @param {object} handle - Handle to the root of the H5AD file.
 * @return {?DataFrame} Result of `load_data_frame` on the "obs" group, or `null` if there is no such group.
 */
function fetch_cells(handle) {
    const listing = handle.children;
    const has_obs_group = "obs" in listing && listing["obs"] == "Group";
    return has_obs_group ? load_data_frame(handle.open("obs")) : null;
}

/**
 * Load an assay matrix from a H5AD file, dispatching on whether it is stored as a sparse group or a dense dataset.
 *
 * @param {string} path - Path to the H5AD file.
 * @param {string} name - Name of the assay inside the file.
 * @param {object} details - Details for this assay, as produced by `extract_assay_details`.
 * @param {object} options - Further options, forwarded as-is to the scran.js initializer.
 *
 * @return Whatever the relevant scran.js initializer returns.
 */
function load_matrix(path, name, details, options) {
    if (!details.sparse) {
        return scran.initializeSparseMatrixFromHdf5Dataset(path, name, options);
    }

    const { rows, columns, csc } = details;
    const not_csc = !csc;
    return scran.initializeSparseMatrixFromHdf5Group(path, name, rows, columns, not_csc, options);
}

/************************
 ******* Dataset ********
 ************************/

/**
 * Dataset in the H5AD format.
 */
export class H5adDataset {
    #h5_file;
    #h5_path;
    #h5_flush;
    #h5_handle;

    #raw_features;
    #raw_cells;
    #assay_details;

    #options;

    // Builds the { files, options } description shared by abbreviate() and serialize().
    // 'fun' converts the underlying SimpleFile into whatever representation the caller wants.
    #dump_summary(fun) {
        let files = [{ type: "h5", file: fun(this.#h5_file) }];
        let options = this.options();
        return { files, options };
    }

    /**
     * @param {SimpleFile|string|Uint8Array|File} h5File - Contents of a H5AD file.
     * On browsers, this may be a File object.
     * On Node.js, this may also be a string containing a file path.
     */
    constructor(h5File) {
        if (h5File instanceof afile.SimpleFile) {
            this.#h5_file = h5File;
        } else {
            this.#h5_file = new afile.SimpleFile(h5File);
        }

        this.#options = H5adDataset.defaults();
        this.clear();
    }

    /**
     * @return {object} Default options, see {@linkcode H5adDataset#setOptions setOptions} for more details.
     */
    static defaults() {
        return {
            countMatrixName: null,
            featureTypeColumnName: null,
            featureTypeRnaName: "Gene Expression",
            featureTypeAdtName: "Antibody Capture",
            featureTypeCrisprName: "CRISPR Guide Capture",
            primaryRnaFeatureIdColumn: null,
            primaryAdtFeatureIdColumn: null,
            primaryCrisprFeatureIdColumn: null
        };
    }

    /**
     * @return {object} Object containing all options used for loading.
     */
    options() {
        return { ...(this.#options) };
    }

    /**
     * @param {object} options - Optional parameters that affect {@linkcode H5adDataset#load load} (but not {@linkcode H5adDataset#summary summary}).
     * @param {?string} [options.countMatrixName] - Name of the layer containing the count matrix.
     * This should be one of the `all_assay_names` reported by {@linkcode H5adDataset#summary summary}, otherwise {@linkcode H5adDataset#load load} will throw an error.
     * If `null`, the "X" dataset is used if it is present in the file, or the first available layer if no "X" dataset is present.
     * @param {?string} [options.featureTypeColumnName] - Name of the per-feature annotation column containing the feature types.
     * If `null`, no column is assumed to contain feature type information.
     * @param {?string} [options.featureTypeRnaName] - Name of the feature type for gene expression.
     * If `null` or the string is not present among the feature types, no RNA features are to be loaded.
     *
     * If no feature type information is available in the dataset (i.e., `featureTypeColumnName = null`), all features are considered to be genes by default.
     * This behavior can also be explicitly requested by setting this argument to the only non-`null` value among all `featureType*Name` parameters.
     * @param {?string} [options.featureTypeAdtName] - Name of the feature type for ADTs.
     * If `null` or the string is not present among the feature types, no ADT features are to be loaded.
     *
     * If no feature type information is available in the dataset and this argument is set to the only non-`null` value among all `featureType*Name` parameters, all features are considered to be ADTs.
     * @param {?string} [options.featureTypeCrisprName] - Name of the feature type for CRISPR guides.
     * If `null` or the string is not present among the feature types, no guides are to be loaded.
     *
     * If no feature type information is available in the dataset and this argument is set to the only non-`null` value among all `featureType*Name` parameters, all features are considered to be guides.
     * @param {?(string|number)} [options.primaryRnaFeatureIdColumn] - Name or index of the column of the `features` {@linkplain external:DataFrame DataFrame} that contains the primary feature identifier for gene expression.
     *
     * If this is `null` or invalid (e.g., out of range index, unavailable name), it is ignored and the row names (from the `_index` group) are used as the primary identifiers.
     * If no row names are present in this situation, no primary identifier is defined.
     * @param {?(string|number)} [options.primaryAdtFeatureIdColumn] - Name or index of the column of the `features` {@linkplain external:DataFrame DataFrame} that contains the primary feature identifier for the ADTs.
     *
     * If this is `null` or invalid (e.g., out of range index, unavailable name), it is ignored and the row names (from the `_index` group) are used as the primary identifiers.
     * If no row names are present in this situation, no primary identifier is defined.
     * @param {?(string|number)} [options.primaryCrisprFeatureIdColumn] - Name or index of the column of the `features` {@linkplain external:DataFrame DataFrame} that contains the primary feature identifier for the CRISPR guides.
     *
     * If this is `null` or invalid (e.g., out of range index, unavailable name), it is ignored and the row names (from the `_index` group) are used as the primary identifiers.
     * If no row names are present in this situation, no primary identifier is defined.
     */
    setOptions(options) {
        for (const [k, v] of Object.entries(options)) {
            this.#options[k] = v;
        }
    }

    // Makes the file contents available at a path via scran.realizeFile() and opens a H5File on it,
    // remembering the flush callback for clear(). No-op if this was already done.
    #instantiate() {
        if (this.#h5_path != null) {
            return;
        }

        let info = scran.realizeFile(this.#h5_file.content());
        this.#h5_path = info.path;
        this.#h5_flush = info.flush;
        this.#h5_handle = new scran.H5File(this.#h5_path);
    }

    // Drops all cached state unless the caller asked for caching.
    #release(cache) {
        if (!cache) {
            this.clear();
        }
    }

    /**
     * Destroy caches if present, releasing the associated memory.
     * This may be called at any time but only has an effect if `cache = true` in {@linkcode H5adDataset#load load} or {@linkcode H5adDataset#summary summary}.
     */
    clear() {
        if (typeof this.#h5_flush == "function") {
            this.#h5_flush();
        }
        this.#h5_flush = null;
        this.#h5_path = null;
        this.#h5_handle = null;

        this.#raw_features = null;
        this.#raw_cells = null;
        this.#assay_details = null;
    }

    /**
     * @return {string} Format of this dataset class.
     * @static
     */
    static format() {
        return "H5AD";
    }

    /**
     * @return {object} Object containing the abbreviated details of this dataset.
     */
    abbreviate() {
        return this.#dump_summary(f => { return { name: f.name(), size: f.size() }; });
    }

    // Populates the cached assay details, if not already present.
    #fetch_assay_details() {
        if (this.#assay_details !== null) {
            return;
        }
        this.#instantiate();
        this.#assay_details = fetch_assay_details(this.#h5_handle, this.#h5_path);
    }

    // Populates the cached per-feature annotations, if not already present.
    // Falls back to an empty DataFrame with one row per matrix row when the file has no usable "var" group.
    #features() {
        if (this.#raw_features !== null) {
            return;
        }
        this.#instantiate();

        let feats = fetch_features(this.#h5_handle);
        if (feats == null) {
            this.#fetch_assay_details();
            feats = new bioc.DataFrame({}, { numberOfRows: this.#assay_details.rows });
        }

        this.#raw_features = feats;
        return;
    }

    // Populates the cached per-cell annotations, if not already present.
    // Falls back to an empty DataFrame with one row per matrix column when the file has no usable "obs" group.
    #cells() {
        if (this.#raw_cells !== null) {
            return;
        }
        this.#instantiate();

        let cells = fetch_cells(this.#h5_handle);
        if (cells === null) {
            this.#fetch_assay_details();
            cells = new bioc.DataFrame({}, { numberOfRows: this.#assay_details.columns });
        }

        this.#raw_cells = cells;
        return;
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.cache=false] - Whether to cache the intermediate results for re-use in subsequent calls to any methods with a `cache` option.
     * If `true`, users should consider calling {@linkcode H5adDataset#clear clear} to release the memory once this dataset instance is no longer needed.
     * If `false`, all caches are cleared before this method returns, even if it throws.
     *
     * @return {object} Object containing the per-feature and per-cell annotations.
     * This has the following properties:
     *
     * - `all_features`: a {@linkplain external:DataFrame DataFrame} of per-feature annotations.
     * - `cells`: a {@linkplain external:DataFrame DataFrame} of per-cell annotations.
     * - `all_assay_names`: an Array of strings containing names of potential count matrices.
     */
    summary({ cache = false } = {}) {
        try {
            this.#features();
            this.#cells();
            this.#fetch_assay_details();

            // The returned object is assembled before the 'finally' block wipes the caches.
            return {
                all_features: this.#raw_features,
                cells: this.#raw_cells,
                all_assay_names: this.#assay_details.names
            };
        } finally {
            this.#release(cache);
        }
    }

    #feature_type_mapping() {
        return {
            RNA: this.#options.featureTypeRnaName,
            ADT: this.#options.featureTypeAdtName,
            CRISPR: this.#options.featureTypeCrisprName
        };
    }

    #primary_mapping() {
        return {
            RNA: this.#options.primaryRnaFeatureIdColumn,
            ADT: this.#options.primaryAdtFeatureIdColumn,
            CRISPR: this.#options.primaryCrisprFeatureIdColumn
        };
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.cache=false] - Whether to cache the intermediate results for re-use in subsequent calls to any methods with a `cache` option.
     * If `true`, users should consider calling {@linkcode H5adDataset#clear clear} to release the memory once this dataset instance is no longer needed.
     * If `false`, all caches are cleared before this method returns, even if it throws.
     *
     * @return {object} An object where each key is a modality name and each value is an array (usually of strings) containing the primary feature identifiers for each row in that modality.
     * The contents are the same as the `primary_ids` returned by {@linkcode H5adDataset#load load} but the order of values may be different.
     */
    previewPrimaryIds({ cache = false } = {}) {
        try {
            this.#features();
            return futils.extractSplitPrimaryIds(this.#raw_features, this.#options.featureTypeColumnName, this.#feature_type_mapping(), "RNA", this.#primary_mapping());
        } finally {
            this.#release(cache);
        }
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.cache=false] - Whether to cache the intermediate results for re-use in subsequent calls to any methods with a `cache` option.
     * If `true`, users should consider calling {@linkcode H5adDataset#clear clear} to release the memory once this dataset instance is no longer needed.
     * If `false`, all caches are cleared before this method returns, even if it throws.
     *
     * @return {object} Object containing the per-feature and per-cell annotations.
     * This has the following properties:
     *
     * - `features`: an object where each key is a modality name and each value is a {@linkplain external:DataFrame DataFrame} of per-feature annotations for that modality.
     * - `cells`: a {@linkplain external:DataFrame DataFrame} containing per-cell annotations.
     * - `matrix`: a {@linkplain external:MultiMatrix MultiMatrix} containing one {@linkplain external:ScranMatrix ScranMatrix} per modality.
     * - `primary_ids`: an object where each key is a modality name and each value is an array (usually of strings) containing the primary feature identifiers for each row in that modality.
     *
     * Modality names are guaranteed to be one of `"RNA"`, `"ADT"` or `"CRISPR"`.
     * We assume that the instance already contains an appropriate mapping from the observed feature types to each expected modality,
     * either from the {@linkcode H5adDataset#defaults defaults} or with {@linkcode H5adDataset#setOptions setOptions}.
     *
     * @throws {Error} If `countMatrixName` was set to a name that is not among the assays in the file.
     */
    load({ cache = false } = {}) {
        try {
            this.#features();
            this.#cells();
            this.#fetch_assay_details();

            let chosen_assay = this.#options.countMatrixName;
            if (chosen_assay == null) {
                chosen_assay = this.#assay_details.names[0];
            } else if (!this.#assay_details.names.includes(chosen_assay)) {
                throw new Error(`failed to find the '${chosen_assay}' assay in the H5AD file`);
            }

            let loaded = load_matrix(this.#h5_path, chosen_assay, this.#assay_details.details[chosen_assay], { forceInteger: true, layered: true });
            let output = futils.splitScranMatrixAndFeatures(loaded, this.#raw_features, this.#options.featureTypeColumnName, this.#feature_type_mapping(), "RNA");
            output.cells = this.#raw_cells;

            output.primary_ids = futils.extractPrimaryIds(output.features, this.#primary_mapping());
            return output;
        } finally {
            this.#release(cache);
        }
    }

    /**
     * @return {object} Object describing this dataset, containing:
     *
     * - `files`: Array of objects representing the files used in this dataset.
     *   Each object corresponds to a single file and contains:
     *   - `type`: a string denoting the type.
     *   - `file`: a {@linkplain SimpleFile} object representing the file contents.
     * - `options`: An object containing additional options to be saved.
     */
    serialize() {
        return this.#dump_summary(f => f);
    }

    /**
     * @param {Array} files - Array of objects like that produced by {@linkcode H5adDataset#serialize serialize}.
     * @param {object} options - Object containing additional options to be passed to the constructor.
     * @return {H5adDataset} A new instance of this class.
     * @static
     */
    static async unserialize(files, options) {
        if (files.length != 1 || files[0].type != "h5") {
            throw new Error("expected exactly one file of type 'h5' for H5AD unserialization");
        }
        let output = new H5adDataset(files[0].file);
        output.setOptions(options);
        return output;
    }
}

/************************
 ******* Results ********
 ************************/

/**
 * Pre-computed analysis results in the H5AD format.
 */
export class H5adResult {
    #h5_file;
    #h5_path;
    #h5_flush;
    #h5_handle;

    #raw_features;
    #raw_cells;
    #assay_details;
    #reddim_details;

    #options;

    /**
     * @param {SimpleFile|string|Uint8Array|File} h5File - Contents of a H5AD file.
     * On browsers, this may be a File object.
     * On Node.js, this may also be a string containing a file path.
     */
    constructor(h5File) {
        if (h5File instanceof afile.SimpleFile) {
            this.#h5_file = h5File;
        } else {
            this.#h5_file = new afile.SimpleFile(h5File);
        }

        this.#options = H5adResult.defaults();
        this.clear();
    }

    /**
     * @return {object} Default options, see {@linkcode H5adResult#setOptions setOptions} for more details.
     */
    static defaults() {
        return {
            primaryMatrixName: null,
            isPrimaryNormalized: true,
            featureTypeColumnName: null,
            reducedDimensionNames: null
        };
    }

    /**
     * @return {object} Object containing all options used for loading.
     */
    options() {
        return { ...(this.#options) };
    }

    /**
     * @param {object} options - Optional parameters that affect {@linkcode H5adResult#load load} (but not {@linkcode H5adResult#summary summary}).
     * @param {?string} [options.primaryMatrixName] - Name of the layer containing the primary matrix.
     * This should be one of the `all_assay_names` reported by {@linkcode H5adResult#summary summary}, otherwise {@linkcode H5adResult#load load} will throw an error.
     * If `null`, the "X" dataset is used if it is present in the file, or the first available layer if no "X" dataset is present.
     * @param {boolean} [options.isPrimaryNormalized] - Whether the primary matrix is already normalized.
     * If `false`, it is assumed to contain count data and is subjected to library size normalization within each modality.
     * @param {?string} [options.featureTypeColumnName] - Name of the per-feature annotation column containing the feature types.
     * If `null`, no column is assumed to contain the feature types, and all features are assumed to be genes (i.e., only the RNA modality is present).
     * @param {?Array} [options.reducedDimensionNames=null] - Array of names of the reduced dimensions to load.
     * If `null`, all reduced dimensions found in the file are loaded.
     */
    setOptions(options) {
        for (const [k, v] of Object.entries(options)) {
            if (k == "reducedDimensionNames") {
                this.#options[k] = bioc.CLONE(v); // avoid pass-by-reference links.
            } else {
                this.#options[k] = v;
            }
        }
    }

    /**
     * Destroy caches if present, releasing the associated memory.
     * This may be called at any time but only has an effect if `cache = true` in {@linkcode H5adResult#load load} or {@linkcode H5adResult#summary summary}.
     */
    clear() {
        if (typeof this.#h5_flush == "function") {
            this.#h5_flush();
        }
        this.#h5_flush = null;
        this.#h5_path = null;
        this.#h5_handle = null;

        this.#raw_features = null;
        this.#raw_cells = null;
        this.#assay_details = null;
        this.#reddim_details = null;
    }

    // Drops all cached state unless the caller asked for caching.
    #release(cache) {
        if (!cache) {
            this.clear();
        }
    }

    // Makes the file contents available at a path via scran.realizeFile() and opens a H5File on it,
    // remembering the flush callback for clear(). No-op if this was already done.
    #instantiate() {
        if (this.#h5_path != null) {
            return;
        }

        let info = scran.realizeFile(this.#h5_file.content());
        this.#h5_path = info.path;
        this.#h5_flush = info.flush;
        this.#h5_handle = new scran.H5File(this.#h5_path);
    }

    // Populates the cached assay details, if not already present.
    #fetch_assay_details() {
        if (this.#assay_details !== null) {
            return;
        }
        this.#instantiate();
        this.#assay_details = fetch_assay_details(this.#h5_handle, this.#h5_path);
        return;
    }

    // Populates the cached per-feature annotations, if not already present.
    // Falls back to an empty DataFrame with one row per matrix row when the file has no usable "var" group.
    #features() {
        if (this.#raw_features !== null) {
            return;
        }
        this.#instantiate();

        let feats = fetch_features(this.#h5_handle);
        if (feats == null) {
            this.#fetch_assay_details();
            feats = new bioc.DataFrame({}, { numberOfRows: this.#assay_details.rows });
        }

        this.#raw_features = feats;
        return;
    }

    // Populates the cached per-cell annotations, if not already present.
    // Falls back to an empty DataFrame with one row per matrix column when the file has no usable "obs" group.
    #cells() {
        if (this.#raw_cells !== null) {
            return;
        }
        this.#instantiate();

        let cells = fetch_cells(this.#h5_handle);
        if (cells === null) {
            this.#fetch_assay_details();
            cells = new bioc.DataFrame({}, { numberOfRows: this.#assay_details.columns });
        }

        this.#raw_cells = cells;
        return;
    }

    // Populates the cached list of reduced dimension names, if not already present.
    // Only datasets directly inside the "obsm" group are listed; anything else in there is ignored.
    #fetch_reddim_details() {
        if (this.#reddim_details !== null) {
            return;
        }
        this.#instantiate();

        let available = [];
        if ("obsm" in this.#h5_handle.children && this.#h5_handle.children["obsm"] == "Group") {
            let ohandle = this.#h5_handle.open("obsm");
            for (const [k, v] of Object.entries(ohandle.children)) {
                if (v == "DataSet") {
                    available.push(k);
                }
            }
        }

        this.#reddim_details = { names: available };
        return;
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.cache=false] - Whether to cache the results for re-use in subsequent calls to this method or {@linkcode H5adResult#load load}.
     * If `true`, users should consider calling {@linkcode H5adResult#clear clear} to release the memory once this dataset instance is no longer needed.
     * If `false`, all caches are cleared before this method returns, even if it throws.
     *
     * @return {object} Object containing the per-feature and per-cell annotations.
     * This has the following properties:
     *
     * - `all_features`: a {@linkplain external:DataFrame DataFrame} of per-feature annotations.
     * - `cells`: a {@linkplain external:DataFrame DataFrame} of per-cell annotations.
     * - `all_assay_names`: an Array of strings containing names of potential primary matrices.
     * - `reduced_dimension_names`: an Array of strings containing names of dimensionality reduction results.
     */
    summary({ cache = false } = {}) {
        try {
            this.#features();
            this.#cells();
            this.#fetch_assay_details();
            this.#fetch_reddim_details();

            // The returned object is assembled before the 'finally' block wipes the caches.
            return {
                all_features: this.#raw_features,
                cells: this.#raw_cells,
                all_assay_names: this.#assay_details.names,
                reduced_dimension_names: this.#reddim_details.names
            };
        } finally {
            this.#release(cache);
        }
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean} [options.cache=false] - Whether to cache the results for re-use in subsequent calls to this method or {@linkcode H5adResult#summary summary}.
     * If `true`, users should consider calling {@linkcode H5adResult#clear clear} to release the memory once this dataset instance is no longer needed.
     * If `false`, all caches are cleared before this method returns, even if it throws.
     *
     * @return {object} Object containing the per-feature and per-cell annotations.
     * This has the following properties:
     *
     * - `features`: an object where each key is a modality name and each value is a {@linkplain external:DataFrame DataFrame} of per-feature annotations for that modality.
     * - `cells`: a {@linkplain external:DataFrame DataFrame} containing per-cell annotations.
     * - `matrix`: a {@linkplain external:MultiMatrix MultiMatrix} containing one {@linkplain external:ScranMatrix ScranMatrix} per modality.
     * - `reduced_dimensions`: an object containing the dimensionality reduction results.
     *   Each value is an array of arrays, where each inner array contains the coordinates for one dimension.
     *
     * @throws {Error} If `primaryMatrixName` was set to a name that is not among the assays in the file.
     */
    load({ cache = false } = {}) {
        try {
            this.#features();
            this.#cells();
            this.#fetch_assay_details();
            this.#fetch_reddim_details();

            let chosen_assay = this.#options.primaryMatrixName;
            if (chosen_assay == null) {
                chosen_assay = this.#assay_details.names[0];
            } else if (!this.#assay_details.names.includes(chosen_assay)) {
                throw new Error(`failed to find the '${chosen_assay}' assay in the H5AD file`);
            }

            let loaded = load_matrix(this.#h5_path, chosen_assay, this.#assay_details.details[chosen_assay], { forceInteger: !this.#options.isPrimaryNormalized });
            let output = futils.splitScranMatrixAndFeatures(loaded, this.#raw_features, this.#options.featureTypeColumnName, null, "");
            output.cells = this.#raw_cells;

            if (!this.#options.isPrimaryNormalized) {
                for (const mod of output.matrix.available()) {
                    let mat = output.matrix.get(mod);
                    output.matrix.add(mod, scran.normalizeCounts(mat, { allowZeros: true }));
                }
            }

            // Loading the dimensionality reduction results.
            let chosen_reddims = this.#options.reducedDimensionNames;
            if (chosen_reddims == null) {
                chosen_reddims = this.#reddim_details.names;
            }

            let reddims = {};
            if (chosen_reddims.length) {
                let ohandle = this.#h5_handle.open("obsm");
                for (const k of chosen_reddims) {
                    let rhandle = ohandle.open(k, { load: true });
                    let shape = rhandle.shape;
                    let ncells = shape[0];
                    let ndims = shape[1];
                    let transposed = scran.transposeMatrix(ncells, ndims, rhandle.values, { columnMajor: false }); // HDF5 stores matrices in row-major format.

                    // After transposition, each run of 'ncells' values holds the coordinates for one dimension.
                    let contents = [];
                    for (let d = 0; d < ndims; d++) {
                        let offset = d * ncells;
                        contents.push(transposed.slice(offset, offset + ncells));
                    }
                    reddims[k] = contents;
                }
            }
            output.reduced_dimensions = reddims;

            return output;
        } finally {
            this.#release(cache);
        }
    }

}