// readers/utils/extract.js

import * as pako from "pako";
import ppp from "papaparse";
import * as astream from "./abstract/stream.js";
import * as afile from "../abstract/file.js";

/**
 * Extract an array of strings from a string-typed HDF5 dataset, if available.
 *
 * @param {object} handle - Handle to an HDF5 group, exposing `children` (a
 * name-to-kind mapping) and an `open()` method.
 * @param {string} name - Name of the child to extract.
 * @return {?Array} The dataset's string contents, or `null` if `name` is
 * absent, is not a dataset, or does not hold strings.
 */
export function extractHdf5Strings(handle, name) {
    // A missing child yields undefined here, which also fails this check.
    if (handle.children[name] !== "DataSet") {
        return null;
    }

    const dataset = handle.open(name);
    return (dataset.type === "String" ? dataset.load() : null);
}

/**
 * Summarize an array, typically corresponding to a single column of per-cell annotation.
 * This can be used as part of a preflight response in a Reader.
 *
 * @param {Array|TypedArray} array - Per-cell annotation array of length equal to the number of cells for a given matrix.
 * An Array is treated as categorical data and should contain strings, while TypedArrays are treated as continuous data.
 * @param {object} [options] - Optional parameters.
 * @param {number} [options.limit=50] - Maximum number of unique values to report for a categorical `array`.
 *
 * @return {object} Object containing `type`, a string indicating whether `array` was categorical or continuous.
 *
 * If `"categorical"`, the object will contain `values`, an array of unique values up to the length specified by `limit`.
 * It will also contain `truncated`, a boolean indicating whether the actual number of unique values exceeds `limit`.
 *
 * If `"continuous"`, the object will contain the numbers `min` and `max` specifying the minimum and maximum value in `array`, respectively.
 * `min` or `max` may be negative or positive infinity, respectively, if there is no bound on one or both ends.
 * If `min > max`, all values in `array` are `NaN`s such that no bound can be found.
 */
export function summarizeArray(array, { limit = 50 } = {}) {
    if (!(array instanceof Array)) {
        // Continuous data: scan for the extremes. NaNs never satisfy either
        // comparison, so they are implicitly skipped.
        let lower = Number.POSITIVE_INFINITY;
        let upper = Number.NEGATIVE_INFINITY;
        for (const val of array) {
            if (val < lower) {
                lower = val;
            }
            if (val > upper) {
                upper = val;
            }
        }
        return {
            "type": "continuous",
            "min": lower,
            "max": upper
        };
    }

    // Categorical data: report the sorted unique values, up to 'limit' of them.
    const uniques = Array.from(new Set(array));
    uniques.sort();
    const truncated = uniques.length > limit;
    return {
        "type": "categorical",
        "values": (truncated ? uniques.slice(0, limit) : uniques),
        "truncated": truncated
    };
}

// Determine the compression of 'x', either from an explicit 'compression'
// (non-null) or by sniffing the first bytes of the content. Returns 'gz' for
// gzip-compressed data and 'none' otherwise.
function guess_compression(x, compression) {
    if (compression !== null) {
        return compression;
    }

    // Only need the first three bytes to check the header.
    const header = (x instanceof Uint8Array ? x : astream.peek(x, 3));

    // Gzip streams start with the magic bytes 0x1F 0x8B, followed by the
    // DEFLATE compression-method byte (0x08).
    const isGzip = (header.length >= 3 && header[0] == 0x1F && header[1] == 0x8B && header[2] == 0x08);
    return (isGzip ? 'gz' : 'none');
}

/**
 * Decode a (possibly gzip-compressed) buffer into a single string.
 *
 * @param {Uint8Array} buffer - Raw file contents.
 * @param {object} [options] - Optional parameters.
 * @param {?string} [options.compression=null] - Either `"gz"` or `"none"`;
 * if `null`, the compression is detected from the buffer's header.
 * @return {string} The decoded text.
 */
export function unpackText(buffer, { compression = null } = {}) {
    const mode = guess_compression(buffer, compression);
    const bytes = (mode === "gz" ? pako.ungzip(buffer) : buffer);
    return (new TextDecoder).decode(bytes);
}

// Soft-deprecated as of 1.1.0; use readLines2 instead.
export function readLines(buffer, { compression = null } = {}) {
    const text = unpackText(buffer, { compression });
    const lines = text.split("\n");
    // A terminating newline yields one empty entry at the end; drop it.
    if (lines.length > 0 && lines[lines.length - 1] === "") {
        lines.pop();
    }
    return lines;
}

// Concatenate an array of Uint8Array fragments and decode the result into a
// single string with the supplied TextDecoder.
function merge_bytes(leftovers, decoder) {
    let size = 0;
    for (const piece of leftovers) {
        size += piece.length;
    }

    const combined = new Uint8Array(size);
    let offset = 0;
    for (const piece of leftovers) {
        combined.set(piece, offset);
        offset += piece.length;
    }

    return decoder.decode(combined);
}

// Feed the contents of 'x' to 'callback' as one or more Uint8Array chunks,
// decompressing along the way if necessary. 'x' may be a file path string
// (Node.js), a Uint8Array, a SimpleFile, or anything accepted by the
// SimpleFile constructor (e.g. a browser File).
async function stream_callback(x, compression, chunkSize, callback) {
    // Force the input to be either a Uint8Array or a file path string.
    if (typeof x == "string" || x instanceof Uint8Array) {
        ; // already in a usable form.
    } else if (x instanceof afile.SimpleFile) {
        x = x.content();
    } else {
        x = (new afile.SimpleFile(x, { name: "dummy" })).content();
    }

    if (guess_compression(x, compression) == "gz") {
        await (new Promise((resolve, reject) => {
            let gz = new pako.Inflate({ chunkSize: chunkSize });
            gz.onData = callback;
            gz.onEnd = status => {
                if (status) {
                    // Non-zero status indicates a zlib error; reject with a
                    // proper Error so callers get a stack trace and can use
                    // 'instanceof Error' checks.
                    reject(new Error("gzip decompression failed; " + gz.msg));
                } else {
                    resolve(null);
                }
            };

            // onData/onEnd must be attached before pushing any data.
            if (typeof x == "string") {
                astream.stream(x, chunkSize, chunk => gz.push(chunk), null, reject);
            } else {
                gz.push(x);
            }
        }));
        return;
    }

    // Remaining possibilities are uncompressed.
    if (typeof x == "string") {
        await (new Promise((resolve, reject) => astream.stream(x, chunkSize, callback, resolve, reject)));
        return;
    }

    callback(x);
    return;
}

/**
 * Read lines of text from a file, possibly with decompression.
 *
 * @param {string|Uint8Array|SimpleFile|File} x - Contents of the file to be read.
 * On Node.js, this may be a string containing a path to a file;
 * on browsers, this may be a File object.
 * @param {object} [options={}] - Optional parameters.
 * @param {?string} [options.compression=null] - Compression of the file contents, either `"gz"` or `"none"`.
 * If `null`, it is determined automatically from the file header.
 * @param {number} [options.chunkSize=65536] - Chunk size in bytes to use for file reading (if `x` is a file path) and decompression (if `compression="gz"`).
 * Larger values improve speed at the cost of memory.
 *
 * @return {Array} Array of strings where each entry contains one line of the file.
 * The newline itself is not included in each string.
 * @async 
 */
export async function readLines2(x, { compression = null, chunkSize = 65536 } = {}) {
    const decoder = new TextDecoder;
    let pending = []; // byte fragments of a line that spans chunk boundaries.
    const lines = [];

    // Complete the current line with 'fragment', merging in any fragments
    // carried over from previous chunks.
    const finish = (fragment) => {
        if (pending.length) {
            pending.push(fragment);
            lines.push(merge_bytes(pending, decoder));
            pending = [];
        } else {
            lines.push(decoder.decode(fragment));
        }
    };

    await stream_callback(x, compression, chunkSize, (chunk) => {
        let start = 0;
        for (let i = 0; i < chunk.length; i++) {
            if (chunk[i] == 10) { // i.e., ASCII newline.
                finish(chunk.subarray(start, i));
                start = i + 1; // skip past the newline.
            }
        }
        if (start != chunk.length) {
            // Copy the tail, to avoid ownership problems as 'chunk' gets deref'd.
            pending.push(chunk.slice(start));
        }
    });

    // Flush a trailing line that lacks a terminating newline.
    if (pending.length) {
        lines.push(merge_bytes(pending, decoder));
    }

    return lines;
}

// Soft-deprecated as of 1.1.0; use readTable2 instead.
export function readTable(buffer, { compression = null, delim = "\t", firstOnly = false } = {}) {
    let decoded = unpackText(buffer, { compression: compression });
    let res = ppp.parse(decoded, { delimiter: delim, preview: (firstOnly ? 1 : 0) });

    // Handle terminating newlines, which manifest as a final row containing a
    // single empty field. Guard against an entirely empty parse result so we
    // don't dereference 'undefined' on empty input.
    if (res.data.length > 0) {
        let last = res.data[res.data.length - 1];
        if (last.length === 1 && last[0] === "") {
            res.data.pop();
        }
    }

    return res.data;
}

/**
 * Read a delimiter-separated table from a buffer, possibly with decompression.
 * This assumes that newlines represent the end of each row of the table, i.e., there cannot be newlines inside quoted strings.
 *
 * @param {string|Uint8Array|SimpleFile|File} x - Contents of the file to be read.
 * On Node.js, this may be a string containing a path to a file;
 * on browsers, this may be a File object.
 * @param {object} [options={}] - Optional parameters.
 * @param {?string} [options.compression=null] - Compression of the file contents, either `"gz"` or `"none"`.
 * If `null`, it is determined automatically from the file header.
 * @param {string} [options.delim="\t"] - Delimiter between fields.
 * @param {number} [options.chunkSize=1048576] - Chunk size in bytes to use for file reading (if `x` is a path), parsing of rows, and decompression (if `compression="gz"`).
 * Larger values improve speed at the cost of memory.
 *
 * @return {Array} Array of length equal to the number of lines in the file.
 * Each entry is an array of strings, containing the `delim`-separated fields for its corresponding line.
 *
 * @throws If the delimited text cannot be parsed.
 * @async
 */
export async function readTable2(x, { compression = null, delim = "\t", chunkSize = 1048576 } = {}) {
    const dec = new TextDecoder;

    let rows = [];
    // Parse one newline-terminated segment of decoded text and append its
    // rows to the output. Throws on parser abort, collecting all messages.
    let parse = (str) => {
        let out = ppp.parse(str, { delimiter: delim });
        if (out.meta.aborted) {
            let msg = "failed to parse delimited file";
            for (const e of out.errors) {
                msg += "; " + e.message;
            }
            throw new Error(msg);
        }
        for (const x of out.data) {
            rows.push(x);
        }
    };

    let leftovers = []; // unparsed byte fragments carried over between chunks.
    let size_left = 0;  // total number of bytes currently held in 'leftovers'.
    let callback = (chunk) => {
        let last = 0;
        for (var i = 0; i < chunk.length; i++) {
            // We assume that all newlines are end-of-rows, i.e., there are no
            // newlines inside quoted strings. Under this assumption, we can
            // safely chunk the input stream based on newlines, parse each
            // chunk, and then combine the parsing results together. To avoid
            // too many parsing calls, we accumulate buffers until we hit 
            // the chunkSize and then we decode + parse them altogether.
            if (chunk[i] == 10 && (i - last) + size_left >= chunkSize) {
                let current = chunk.subarray(last, i);
                if (leftovers.length) {
                    leftovers.push(current);
                    parse(merge_bytes(leftovers, dec));
                    leftovers = [];
                } else {
                    parse(dec.decode(current));
                }
                last = i + 1; // skip past the newline.
                size_left = 0;
            }
        }

        if (last != chunk.length) {
            leftovers.push(chunk.slice(last)); // copy to avoid problems with ownership as chunk gets deref'd.
            size_left += chunk.length - last;
        }
    };

    await stream_callback(x, compression, chunkSize, callback);

    // Parse whatever is left after the final chunk. Note that the text here
    // may still contain (or even end with) newlines, as parsing above is only
    // triggered once 'chunkSize' bytes have accumulated.
    if (leftovers.length) {
        let combined = merge_bytes(leftovers, dec);
        parse(combined);
        if (combined[combined.length - 1] == "\n") { // guaranteed to have non-zero length, by virtue of how 'leftovers' is filled.
            rows.pop();            // drop the empty row created by the terminating newline.
        }
    }

    return rows;    
}

/**
 * Detect if an array contains only stringified numbers and, if so, convert it into a TypedArray.
 * Conversion will still be performed for non-number strings corresponding to missing values or explicit not-a-number entries.
 *
 * @param {Array} x Array of strings, usually corresponding to a column in a table read by {@linkcode readDSVFromBuffer}.
 *
 * @return {?Float64Array} A Float64Array is returned if `x` contains stringified numbers.
 * Otherwise, `null` is returned if the conversion could not be performed.
 */
export function promoteToNumber(x) {
    const promoted = new Float64Array(x.length);

    for (const [i, v] of Object.entries(x)) {
        // Both Number() and parseFloat() must agree that 'v' is numeric; see
        // https://stackoverflow.com/questions/175739/how-can-i-check-if-a-string-is-a-valid-number.
        const full = Number(v);
        const prefix = parseFloat(v);
        if (!isNaN(full) && !isNaN(prefix)) {
            promoted[i] = full;
        } else if (v === "" || v === "NA" || v == "na" || v == "NaN" || v == "nan") {
            promoted[i] = NaN; // treat missing/NA markers as NaN.
        } else if (v == "Inf" || v == "inf") {
            promoted[i] = Number.POSITIVE_INFINITY;
        } else if (v == "-Inf" || v == "-inf") {
            promoted[i] = Number.NEGATIVE_INFINITY;
        } else {
            return null; // non-numeric entry; no conversion possible.
        }
    }

    return promoted;
}