steps/rna_quality_control.js

import * as scran from "scran.js"; 
import * as bioc from "bioconductor";
import * as utils from "./utils/general.js";
import * as inputs_module from "./inputs.js";
import * as rutils from "../readers/index.js";

const baseUrl = "https://github.com/kanaverse/kana-special-features/releases/download/v1.0.0";

export const step_name = "rna_quality_control";

/**
 * Results of computing per-cell RNA-derived QC metrics,
 * see [here](https://kanaverse.github.io/scran.js/PerCellRnaQcMetricsResults.html) for details.
 *
 * @external PerCellRnaQcMetricsResults
 */

/**
 * Suggested filters for the RNA-derived QC metrics,
 * see [here](https://kanaverse.github.io/scran.js/SuggestRnaQcFiltersResults.html) for details.
 *
 * @external SuggestRnaQcFiltersResults
 */

/**
 * This step applies quality control on the RNA count matrix.
 * Specifically, it computes the QC metrics and filtering thresholds, 
 * wrapping the [`perCellRnaQcMetrics`](https://kanaverse.github.io/scran.js/global.html#perCellRnaQcMetrics)
 * and [`suggestRnaQcFilters`](https://kanaverse.github.io/scran.js/global.html#suggestRnaQcFilters) functions
 * from [**scran.js**](https://github.com/kanaverse/scran.js).
 * Note that the actual filtering is done by {@linkplain CellFilteringState}.
 *
 * Methods not documented here are not part of the stable API and should not be used by applications.
 * @hideconstructor
 */
export class RnaQualityControlState {
    #inputs;
    #cache;
    #parameters;

    constructor(inputs, parameters = null, cache = null) {
        if (!(inputs instanceof inputs_module.InputsState)) {
            throw new Error("'inputs' should be an InputsState object");
        }
        this.#inputs = inputs;

        this.#parameters = (parameters === null ? {} : parameters);
        this.#cache = (cache === null ? {} : cache);
        this.changed = false;
    }

    free() {
        utils.freeCache(this.#cache.metrics);
        utils.freeCache(this.#cache.filters);
        utils.freeCache(this.#cache.metrics_buffer);
        utils.freeCache(this.#cache.discard_buffer);
    }

    /***************************
     ******** Getters **********
     ***************************/
    
    valid() {
        let input = this.#inputs.fetchCountMatrix();
        return input.has("RNA");
    }

    /**
     * @return {object} Object containing the parameters.
     */
    fetchParameters() {
        let output = { ...this.#parameters }; // avoid pass-by-reference links.
        output.species = bioc.CLONE(output.species);
        return output;
    }

    /**
     * @return {Uint8WasmArray} Buffer containing the discard vector of length equal to the number of cells,
     * where each element is truthy if the corresponding cell is to be discarded.
     */
    fetchDiscards() {
        return this.#cache.discard_buffer;
    }

    /**
     * @return {external:SuggestRnaQcFiltersResults} Result of filtering on the RNA-derived QC metrics.
     */
    fetchFilters() {
        return this.#cache.filters;
    }

    /**
     * @return {external:PerCellRnaQcMetricsResults} RNA-derived QC metrics.
     */
    fetchMetrics() {
        return this.#cache.metrics;
    }

    /****************************
     ******** Defaults **********
     ****************************/

    static defaults () {
        return {
            guess_ids: true,
            gene_id_column: null,
            use_reference_mito: true,
            species: [],
            gene_id_type: "ENSEMBL",
            mito_prefix: "mt-",

            filter_strategy: "automatic", 
            nmads: 3,

            sum_threshold: 100,
            detected_threshold: 100,
            mito_threshold: 0.2
        };
    }

    static configureFeatureParameters(use_reference_mito, guesses) {
        let best_key = null;
        let best = { type: "symbol", species: "human", confidence: 0 };

        if ("row_names" in guesses) {
            let val = guesses.row_names;
            if (val.confidence > best.confidence && (use_reference_mito || val.type == "symbol")) {
                best = val;
            }
        }

        for (const [key, val] of Object.entries(guesses.columns)) {
            if (val.confidence > best.confidence && (use_reference_mito || val.type == "symbol")) {
                best = val;
                best_key = key;
            }
        }

        return {
            gene_id_column: best_key,
            species: [best.species],
            gene_id_type: best.type.toUpperCase()
        };
    }

    /**
     * Array of strings containing the taxonomy IDs for species where mitochondrial gene lists are available.
     * @type {Array}
     */
    static mitochondriaSpecies = [ 
        "9606",  // Mouse
        "10090", // Human
        "6239",  // C. elegans
        "10116", // Rat
        "9541",  // M. fascicularis
        "7227",  // Fly
        "7955",  // Zebrafish
        "9598"   // Chimp
    ];

    /***************************
     ******** Remotes **********
     ***************************/

    async #acquire_reference(species, feature_type) {
        let output = new Set;
        let mito_lists = RnaQualityControlState.#mito_lists;

        for (const s of species) {
            let target = s + "-mito-" + feature_type.toLowerCase() + ".txt.gz";
            if (!(target in mito_lists)) {
                let contents = await RnaQualityControlState.#downloadFun(baseUrl + "/" + target);
                let lines = await rutils.readLines2(contents, { compression: "gz" });
                mito_lists[target] = lines;
            }

            mito_lists[target].forEach(x => { output.add(x); });
        }

        return output;
    }

    static #mito_lists = {};

    /**
     * Flush all cached lists of mitochondrial genes.
     *
     * By default, {@linkcode RnaQualityControlState#compute compute} will cache the mitochondrial gene lists in a static member for re-use across {@linkplain RnaQualityControlState} instances.
     * These cached lists are not tied to any single instance and will not be removed by garbage collectors or by {@linkcode freeAnalysis}.
     * Rather, this function should be called to release the relevant memory.
     */
    static flush() {
        RnaQualityControlState.#mito_lists = {};
        return;
    }

    static #downloadFun = utils.defaultDownload;

    /**
     * Specify a function to download the reference mitochondrial gene lists.
     *
     * @param {function} fun - Function that accepts a single string containing a URL and returns any value that can be used in the {@linkplain SimpleFile} constructor.
     * This is most typically a Uint8Array of that URL's contents, but it can also be a path to a locally cached file on Node.js.
     *
     * @return `fun` is set as the global downloader for this step. 
     * The _previous_ value of the downloader is returned.
     */
    static setDownload(fun) {
        let previous = RnaQualityControlState.#downloadFun;
        RnaQualityControlState.#downloadFun = fun;
        return previous;
    }

    /***************************
     ******** Compute **********
     ***************************/

    /**
     * This method should not be called directly by users, but is instead invoked by {@linkcode runAnalysis}.
     *
     * @param {object} parameters - Parameter object, equivalent to the `rna_quality_control` property of the `parameters` of {@linkcode runAnalysis}.
     * @param {boolean} parameters.guess_ids - Automatically choose feature-based parameters based on the feature annotation for the RNA modality.
     * If set to `true`, the following logic is applied:
     *
     * - If `use_reference_mito = true`, the annotation column that best matches human/mouse Ensembl/symbols is set as `gene_id_column`.
     *   Based on the identified species and feature type, `species` and `gene_id_type` are also set.
     * - If `use_reference_mito = false`, the annotation column that best matches human/mouse symbols is set as `gene_id_column`.
     *
     * @param {?(string|number)} parameters.gene_id_column - Name or index of the column of the feature annotations that contains the gene identifiers for the RNA modality.
     * If `null`, the row names are used.
     * Ignored if `guess_ids = true`.
     * @param {boolean} parameters.use_reference_mito - Whether to use the reference lists of mitochondrial genes.
     * If `false`, mitochondrial genes are instead identified from their prefix.
     * @param {Array} parameters.species - Array of strings specifying zero, one or more species to use to obtain a reference list of mitochondrial genes.
     * Each entry should be a taxonomy ID (e.g. `"9606"`, `"10090"`) as specified in {@linkcode RnaQualityControlState#mitochondriaSpecies mitochondriaSpecies}).
     * Ignored if `guess_ids = true`.
     * @param {string} parameters.gene_id_type - Name of the feature type in the reference list of mitochondrial genes.
     * This can be any one of `"ENSEMBL"`, `"SYMBOL"`, or `"ENTREZ"`.
     * Ignored if `guess_ids = true`.
     * @param {?string} parameters.mito_prefix - Case-insensitive prefix to use to identify mitochondrial genes from the dataset.
     * Only used when `use_reference_mito = false`; in such cases, `gene_id_column` should point to symbols.
     * If `null`, no prefix-based identification is performed.
     * @param {string} parameters.filter_strategy - Strategy for defining a filter threshold for the QC metrics.
     * This can be `"automatic"` or `"manual"`.
     * @param {number} parameters.nmads - Number of MADs to use for automatically selecting the filter threshold for each metric.
     * Only used when `filter_strategy = "automatic"`.
     * @param {number} parameters.sum_threshold - Manual threshold on the sum of counts for each cell.
     * Cells are only retained if their sums are equal to or greater than this threshold.
     * Only used when `filter_strategy = "manual"`.
     * @param {number} parameters.detected_threshold - Manual threshold on the detected number of features for each cell.
     * Cells are only retained if the detected number is equal to or greater than this threshold.
     * Only used when `filter_strategy = "manual"`.
     * @param {number} parameters.mito_threshold - Manual threshold on the mitochondrial proportion for each cell.
     * Cells are only retained if their totals are less than or equal to this threshold.
     * Only used when `filter_strategy = "manual"`.
     *
     * @return The object is updated with the new results.
     * @async
     */
    async compute(parameters) {
        let { guess_ids, gene_id_column, use_reference_mito, species, gene_id_type, mito_prefix, filter_strategy, nmads, sum_threshold, detected_threshold, mito_threshold } = parameters;

        // Some back-compatibility here.
        if (typeof guess_ids == "undefined") {
            if ("automatic" in parameters) {
                guess_ids = parameters.automatic;
            } else {
                guess_ids = true;
                use_reference_mito = parameters.use_mito_default;
                let def = RnaQualityControlState.defaults();
                gene_id_column = def.gene_id_column;
                species = def.species;
                gene_id_type = def.gene_id_type;
            }
        }

        this.changed = false;

        if (
            this.#inputs.changed || 
            guess_ids !== this.#parameters.guess_ids ||
            use_reference_mito !== this.#parameters.use_reference_mito || 
            (
                !guess_ids && 
                (
                    gene_id_column !== this.#parameters.gene_id_column || 
                    (!use_reference_mito && mito_prefix !== this.#parameters.mito_prefix) ||
                    (
                        use_reference_mito && 
                        (
                            utils.changedParameters(species, this.#parameters.species) || 
                            gene_id_type !== this.#parameters.gene_id_type
                        )
                    )
                )
            ) 
        ) {
            utils.freeCache(this.#cache.metrics);

            if (this.valid()) {
                let gene_id_column2 = gene_id_column;
                let species2 = species;
                let gene_id_type2 = gene_id_type;

                if (guess_ids) {
                    let guesses = this.#inputs.guessRnaFeatureTypes();
                    let backcomp = RnaQualityControlState.configureFeatureParameters(use_reference_mito, guesses);
                    gene_id_column2 = backcomp.gene_id_column;
                    species2 = backcomp.species;
                    gene_id_type2 = backcomp.gene_id_type;
                }

                var gene_info = this.#inputs.fetchFeatureAnnotations()["RNA"];
                let val = (gene_id_column2 == null ? gene_info.rowNames() : gene_info.column(gene_id_column2));
                var subsets = utils.allocateCachedArray(gene_info.numberOfRows(), "Uint8Array", this.#cache, "metrics_buffer");
                subsets.fill(0);

                if (val !== null) {
                    if (use_reference_mito) {
                        let lists = await this.#acquire_reference(species2, gene_id_type2);
                        var sub_arr = subsets.array();
                        val.forEach((x, i) => {
                            if (lists.has(x)) {
                                sub_arr[i] = 1;
                            }
                        });
                    } else if (mito_prefix !== null) {
                        var lower_mito = mito_prefix.toLowerCase();
                        var sub_arr = subsets.array();
                        val.forEach((x, i) => {
                            if(x.toLowerCase().startsWith(lower_mito)) {
                                sub_arr[i] = 1;
                            }
                        });
                    }
                }

                var mat = this.#inputs.fetchCountMatrix().get("RNA");
                this.#cache.metrics = scran.perCellRnaQcMetrics(mat, [subsets]);
                this.changed = true;
            } else {
                delete this.#cache.metrics;
            }
        }

        this.#parameters.guess_ids = guess_ids;
        this.#parameters.gene_id_column = gene_id_column;
        this.#parameters.use_reference_mito = use_reference_mito;
        this.#parameters.species = bioc.CLONE(species); // avoid pass-by-reference behavior.
        this.#parameters.gene_id_type = gene_id_type;
        this.#parameters.mito_prefix = mito_prefix;

        if (this.changed || 
            filter_strategy !== this.#parameters.filter_strategy ||
            nmads !== this.#parameters.nmads ||
            sum_threshold !== this.#parameters.sum_threshold ||
            detected_threshold !== this.#parameters.detected_threshold ||
            mito_threshold !== this.#parameters.mito_threshold
        ) {
            utils.freeCache(this.#cache.filters);

            if (this.valid()) {
                let block = this.#inputs.fetchBlock();

                if (filter_strategy === "automatic") {
                    this.#cache.filters = scran.suggestRnaQcFilters(this.#cache.metrics, { numberOfMADs: nmads, block: block });
                } else if (filter_strategy === "manual") {
                    let block_levels = this.#inputs.fetchBlockLevels();
                    this.#cache.filters = scran.emptySuggestRnaQcFiltersResults(1, block_levels === null ? 1 : block_levels.length);
                    this.#cache.filters.thresholdsSums({ copy: false }).fill(sum_threshold);
                    this.#cache.filters.thresholdsDetected({ copy: false }).fill(detected_threshold);
                    this.#cache.filters.thresholdsSubsetProportions(0, { copy: false }).fill(mito_threshold);
                } else {
                    throw new Error("unknown RNA QC filtering strategy '" + filter_strategy + "'");
                }

                var discard = utils.allocateCachedArray(this.#cache.metrics.numberOfCells(), "Uint8Array", this.#cache, "discard_buffer");
                this.#cache.filters.filter(this.#cache.metrics, { block: block, buffer: discard });
                this.changed = true;
            } else {
                delete this.#cache.filters;
            }

            this.#parameters.filter_strategy = filter_strategy;
            this.#parameters.nmads = nmads;
            this.#parameters.sum_threshold = sum_threshold;
            this.#parameters.detected_threshold = detected_threshold;
            this.#parameters.mito_threshold = mito_threshold;
        }

        return;
    }
}