clusterKmeans.js - Documentation

import * as utils from "./utils.js";
import * as gc from "./gc.js";
import { RunPcaResults } from "./runPca.js";

/**
 * Wrapper around the k-means clustering results on the Wasm heap, produced by {@linkcode clusterKmeans}.
 * @hideconstructor
 */
export class ClusterKmeansResults {
    #id;
    #results;

    constructor(id, raw) {
        this.#results = raw;
        this.#id = id;
    }

    /**
     * @return {number} Number of cells in the results.
     */
    numberOfCells() {
        return this.#results.num_obs();
    }

    /**
     * @return {number} Number of clusters in the results.
     */
    numberOfClusters() {
        return this.#results.num_clusters();
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean|string} [options.copy=true] - Whether to copy the results from the Wasm heap, see {@linkcode possibleCopy}.
     * @return {Int32Array|Int32WasmArray} Array containing the cluster assignment for each cell.
     */
    clusters(options = {}) {
        const { copy = true, ...others } = options;
        utils.checkOtherOptions(others);
        return utils.possibleCopy(this.#results.clusters(), copy);
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean|string} [options.copy=true] - Whether to copy the results from the Wasm heap, see {@linkcode possibleCopy}.
     * @return {Int32Array|Int32WasmArray} Array containing the number of cells in each cluster.
     */
    sizes(options = {}) {
        const { copy = true, ...others } = options;
        utils.checkOtherOptions(others);
        return utils.possibleCopy(this.#results.cluster_sizes(), copy);
    }

    /**
     * @param {object} [options={}] - Optional parameters.
     * @param {boolean|string} [options.copy=true] - Whether to copy the results from the Wasm heap, see {@linkcode possibleCopy}.
     * @return {Float64Array|Float64WasmArray} Array containing the cluster centers in column-major format,
     * where rows are dimensions and columns are the clusters.
     */
    centers(options = {}) {
        const { copy = true, ...others } = options;
        utils.checkOtherOptions(others);
        return utils.possibleCopy(this.#results.centers(), copy);
    }

    /**
     * @return {number} Number of refinement iterations performed by the algorithm.
     */
    iterations() {
        return this.#results.iterations();
    }

    /**
     * @return {number} Status of the algorithm - anything other than zero usually indicates a problem with convergence.
     */
    status() {
        return this.#results.status();
    }

    /**
     * @return Frees the memory allocated on the Wasm heap for this object.
     * This invalidates this object and all references to it.
     */
    free() {
        if (this.#results !== null) {
            gc.release(this.#id);
            this.#results = null;
        }
        return;
    }
}

/**
 * Cluster cells using k-means.
 * A variety of initialization and refinement algorithms can be used here, see the [**kmeans** documentation](https://github.com/LTLA/CppKmeans) for more details.
 *
 * @param {(RunPcaResults|Float64WasmArray|Array|TypedArray)} x - Numeric coordinates of each cell in the dataset.
 * For array inputs, this is expected to be in column-major format where the rows are the variables and the columns are the cells.
 * For a {@linkplain RunPcaResults} input, we extract the principal components.
 * @param {number} clusters Number of clusters to create.
 * This should not be greater than the number of cells.
 * @param {object} [options={}] - Optional parameters.
 * @param {?number} [options.numberOfDims=null] - Number of variables/dimensions per cell.
 * Only used (and required) for array-like `x`.
 * @param {?number} [options.numberOfCells=null] - Number of cells.
 * Only used (and required) for array-like `x`.
 * @param {string} [options.initMethod="pca-part"] - Initialization method.
 * Setting `"random"` will randomly select `clusters` cells as centers.
 * Setting `"kmeans++"` will use the weighted sampling approach of Arthur and Vassilvitskii (2007).
 * Setting `"var-part"` will use variance partitioning from Su and Dy (2007).
 * @param {number} [options.initSeed=5768] - Seed to use for random number generation during initialization.
 * @param {number} [options.initVarPartSizeAdjust=1] - Adjustment factor for the cluster sizes, used when `initMethod = "var-part"`.
 * Larger values (up to 1) will prioritize partitioning of clusters with more cells.
 * @param {boolean} [options.initVarPartOptimize=true] - Whether to optimize the partition at each step to minimize the sum of squares, when `initMethod = "var-part"`.
 * @param {string} [options.refineMethod="hartigan-wong"] - Refinement method.
 * This can be either `"hartigan-wong"` or `"lloyd"`.
 * @param {number} [options.refineLloydIterations=10] - Number of iterations for the Lloyd refinement algorithm.
 * @param {number} [options.refineHartiganWong=10] - Number of iterations for the Hartigan-Wong refinement algorithm.
 * @param {?number} [options.numberOfThreads=null] - Number of threads to use.
 * If `null`, defaults to {@linkcode maximumThreads}.
 *
 * @return {ClusterKmeansResults} Object containing the clustering results.
 */
export function clusterKmeans(x, clusters, options = {}) {
    let { 
        numberOfDims = null, 
        numberOfCells = null, 
        initMethod = "var-part", 
        initSeed = 5768, 
        initVarPartSizeAdjust = 1, 
        initVarPartOptimize = true, 
        refineMethod = "hartigan-wong",
        refineLloydIterations = 100,
        refineHartiganWongIterations = 10,
        numberOfThreads = null,
        ...others
    } = options;
    utils.checkOtherOptions(others);

    var buffer;
    var output;
    let nthreads = utils.chooseNumberOfThreads(numberOfThreads);

    try {
        let pptr;

        if (x instanceof RunPcaResults) {
            numberOfDims = x.numberOfPCs();
            numberOfCells = x.numberOfCells();
            let pcs = x.principalComponents({ copy: false });
            pptr = pcs.byteOffset;

        } else {
            if (numberOfDims === null || numberOfCells === null) {
                throw new Error("'numberOfDims' and 'numberOfCells' must be specified when 'x' is an Array");
            }

            buffer = utils.wasmifyArray(x, "Float64WasmArray");
            if (buffer.length != numberOfDims * numberOfCells) {
                throw new Error("length of 'x' must be the product of 'numberOfDims' and 'numberOfCells'");
            }

            pptr = buffer.offset;
        }

        output = gc.call(
            module => module.cluster_kmeans(
                pptr,
                numberOfDims,
                numberOfCells,
                clusters,
                initMethod,
                initSeed,
                initVarPartSizeAdjust,
                initVarPartOptimize,
                refineMethod,
                refineLloydIterations,
                refineHartiganWongIterations,
                nthreads
            ),
            ClusterKmeansResults
        );

    } catch (e) {
        utils.free(output);
        throw e;

    } finally {
        utils.free(buffer);
    }

    return output;
}