Source code for cellhint.harmonize

from anndata import AnnData
import scanpy
from typing import Union, Optional
import pandas as pd
import numpy as np
from .distance import Distance
from .distances import Distances
from .align import DistanceAlignment
from . import logger


[docs]
def harmonize(adata: AnnData,
              #dataset & cell type info
              dataset: str, cell_type: str, use_rep: Optional[str] = None, metric: Optional[str] = None,
              #PCT the distances or directly calculate
              use_pct: bool = False,
              #filter and normalize; note PCT will normalize after prediction
              filter_cells: bool = False, normalize: bool = True, Gaussian_kernel: bool = False,
              #PCT train
              F_test_prune: bool = True, p_thres: float = 0.05, random_state: int = 2,
              #order of datasets
              dataset_order: Optional[Union[list, tuple, np.ndarray, pd.Series, pd.Index]] = None, reorder_dataset: bool = True,
              #align params
              minimum_unique_percents: Union[list, tuple, np.ndarray, pd.Series, pd.Index, float] = (0.4, 0.5, 0.6, 0.7, 0.8),
              minimum_divide_percents: Union[list, tuple, np.ndarray, pd.Series, pd.Index, float] = (0.1, 0.15, 0.2),
              maximum_novel_percent: float = 0.05,
              #reannotate
              reannotate: bool = True, prefix: str = '',
              #to PCT train
              **kwargs) -> DistanceAlignment:
    """
    PCT-based cell type harmonization across datasets/batches.

    Parameters
    ----------
    adata
        An :class:`~anndata.AnnData` object containing different datasets/batches and cell types.
        In most scenarios, the format of the expression `.X` in the AnnData is flexible (normalized, log-normalized, z-scaled, etc.).
        However, when `use_rep` is specified as `'X'` (or `X_pca` is not detected in `.obsm` and no other latent representations are provided), `.X` should be log-normalized (to a constant total count per cell).
    dataset
        Column name (key) of cell metadata specifying dataset/batch information.
    cell_type
        Column name (key) of cell metadata specifying cell type information.
    use_rep
        Representation used to calculate distances. This can be `'X'` or any representations stored in `.obsm`.
        Default to the PCA coordinates if present (if not, use the expression matrix `X`).
    metric
        Metric to calculate the distance between each cell and each cell type. Can be `'euclidean'`, `'cosine'`, `'manhattan'` or any metrics applicable to :func:`sklearn.metrics.pairwise_distances`.
        Default to `'euclidean'` if latent representations are used for calculating distances, and to `'correlation'` if the expression matrix is used.
    use_pct
        Whether to use a predictive clustering tree to infer cross-dataset cell type distances.
        Setting to `True` will calculate distances based on PCT, which is intended for datasets with large batch effects.
        (Default: `False`)
    filter_cells
        Whether to filter out cells whose gene expression profiles do not correlate most with the eigen cell they belong to (i.e., correlate most with other cell types).
        Setting to `True` will speed up the run as only a subset of cells are used, but will render the remaining cells (i.e., filtered cells) unannotated (see the `reannotate` argument).
        (Default: `False`)
    normalize
        Whether to normalize the distance matrix if `use_pct = False` (or normalize the predicted distance if `use_pct = True`).
        (Default: `True`)
    Gaussian_kernel
        Whether to apply the Gaussian kernel to the distance matrix.
        (Default: `False`)
    F_test_prune
        Whether to use a F-test to prune the tree by removing unnecessary splits.
        (Default: `True`)
    p_thres
        p-value threshold for pruning nodes after F-test.
        (Default: `0.05`)
    random_state
        Random seed for feature shuffling during PCT training.
        (Default: `2`)
    dataset_order
        Order of datasets to be aligned. If this argument is specified, `reorder_dataset` is ignored.
        Default to the order in the distance matrix (alphabetical order in most cases) if `reorder_dataset = False`.
    reorder_dataset
        Whether to reorder datasets based on their pairwise similarities.
        (Default: `True`)
    minimum_unique_percents
        The minimum cell assignment fraction(s) to claim a cell type as uniquely matched to a cell type from the other dataset.
        By default, five values will be tried (0.4, 0.5, 0.6, 0.7, 0.8) to find the one that produces least alignments in each harmonization iteration.
    minimum_divide_percents
        The minimum cell assignment fraction(s) to claim a cell type as divisible into two or more cell types from the other dataset.
        By default, three values will be tried (0.1, 0.15, 0.2) to find the one that produces least alignments in each harmonization iteration.
    maximum_novel_percent
        The maximum cell assignment fraction to claim a cell type as novel to a given dataset.
        (Default: `0.05`)
    reannotate
        Whether to reannotate cells into harmonized cell types.
        (Default: `True`)
    prefix
        Column prefix for the reannotation data frame.
    **kwargs
        Other keyword arguments passed to :class:`~cellhint.pct.PredictiveClusteringTree`.

    Returns
    ----------
    DistanceAlignment
        A :class:`~cellhint.align.DistanceAlignment` object. Four important attributes within this class are:
        1) :attr:`~cellhint.align.DistanceAlignment.base_distance`, cross-dataset distances between all cells and all cell types.
        2) :attr:`~cellhint.align.DistanceAlignment.relation`, the harmonization table.
        3) :attr:`~cellhint.align.DistanceAlignment.groups`, high-hierarchy cell types categorizing rows of the harmonization table.
        4) :attr:`~cellhint.align.DistanceAlignment.reannotation`, reannotated cell types and cell type groups.
    """
    #raw counts are not allowed to build trees
    if use_pct and adata.X[:1000].min() >= 0 and float(adata.X[:1000].max()).is_integer():
        raise ValueError(
                f"🛑 `.X` of the AnnData is detected to be raw counts, which is not suitable for building PCT")
    #build PCT using all genes is not realistic
    if use_pct and adata.n_vars > 15000:
        logger.warn(f"⚠️ Warning: {adata.n_vars} features are used and may take long time for building PCT. Subsetting the AnnData into HVGs is recommended")
    #generate a combined `Distance`
    if use_pct:
        separate_distances = Distances(adata, dataset = dataset, cell_type = cell_type, use_rep = use_rep, metric = metric, n_jobs = -1)
        if filter_cells:
            separate_distances.filter_cells(check_symmetry = False)
        separate_distances.train(F_test_prune = F_test_prune, p_thres = p_thres, random_state = random_state, **kwargs)
        combined_distance = separate_distances.predict(normalize = normalize, return_distance = True, Gaussian_kernel = Gaussian_kernel)
    else:
        combined_distance = Distance.from_adata(adata, dataset = dataset, cell_type = cell_type, use_rep = use_rep, metric = metric, n_jobs = -1, check_params = True)
        if filter_cells:
            combined_distance.filter_cells(check_symmetry = False)
        if normalize:
            combined_distance.normalize(Gaussian_kernel = Gaussian_kernel, rank = True, normalize = True)
    #before cell type alignment
    combined_distance.assign()
    alignment = DistanceAlignment(combined_distance, check = False, dataset_order = dataset_order, row_normalize = True, maximum_novel_percent = maximum_novel_percent)
    if dataset_order is None and reorder_dataset:
        logger.info(f"🏆 Reordering datasets")
        alignment.reorder_dataset()
    #cell type alignment
    alignment.best_align(dataset_order = None, minimum_unique_percents = minimum_unique_percents, minimum_divide_percents = minimum_divide_percents)
    #reannotate
    if reannotate:
        logger.info(f"🖋️ Reannotating cells")
        alignment.reannotate(prefix = prefix)
    logger.info(f"✅ Harmonization done!")
    #return
    return alignment


harmonise = harmonize