from anndata import AnnData
import scanpy
from typing import Union, Optional
import pandas as pd
import numpy as np
from .distance import Distance
from .distances import Distances
from .align import DistanceAlignment
from . import logger
[docs]
def harmonize(adata: AnnData,
#dataset & cell type info
dataset: str, cell_type: str, use_rep: Optional[str] = None, metric: Optional[str] = None,
#PCT the distances or directly calculate
use_pct: bool = False,
#filter and normalize; note PCT will normalize after prediction
filter_cells: bool = False, normalize: bool = True, Gaussian_kernel: bool = False,
#PCT train
F_test_prune: bool = True, p_thres: float = 0.05, random_state: int = 2,
#order of datasets
dataset_order: Optional[Union[list, tuple, np.ndarray, pd.Series, pd.Index]] = None, reorder_dataset: bool = True,
#align params
minimum_unique_percents: Union[list, tuple, np.ndarray, pd.Series, pd.Index, float] = (0.4, 0.5, 0.6, 0.7, 0.8),
minimum_divide_percents: Union[list, tuple, np.ndarray, pd.Series, pd.Index, float] = (0.1, 0.15, 0.2),
maximum_novel_percent: float = 0.05,
#reannotate
reannotate: bool = True, prefix: str = '',
#to PCT train
**kwargs) -> DistanceAlignment:
"""
PCT-based cell type harmonization across datasets/batches.
Parameters
----------
adata
An :class:`~anndata.AnnData` object containing different datasets/batches and cell types.
In most scenarios, the format of the expression `.X` in the AnnData is flexible (normalized, log-normalized, z-scaled, etc.).
However, when `use_rep` is specified as `'X'` (or `X_pca` is not detected in `.obsm` and no other latent representations are provided), `.X` should be log-normalized (to a constant total count per cell).
dataset
Column name (key) of cell metadata specifying dataset/batch information.
cell_type
Column name (key) of cell metadata specifying cell type information.
use_rep
Representation used to calculate distances. This can be `'X'` or any representations stored in `.obsm`.
Default to the PCA coordinates if present (if not, use the expression matrix `X`).
metric
Metric to calculate the distance between each cell and each cell type. Can be `'euclidean'`, `'cosine'`, `'manhattan'` or any metrics applicable to :func:`sklearn.metrics.pairwise_distances`.
Default to `'euclidean'` if latent representations are used for calculating distances, and to `'correlation'` if the expression matrix is used.
use_pct
Whether to use a predictive clustering tree to infer cross-dataset cell type distances.
Setting to `True` will calculate distances based on PCT, which is intended for datasets with large batch effects.
(Default: `False`)
filter_cells
Whether to filter out cells whose gene expression profiles do not correlate most with the eigen cell they belong to (i.e., correlate most with other cell types).
Setting to `True` will speed up the run as only a subset of cells are used, but will render the remaining cells (i.e., filtered cells) unannotated (see the `reannotate` argument).
(Default: `False`)
normalize
Whether to normalize the distance matrix if `use_pct = False` (or normalize the predicted distance if `use_pct = True`).
(Default: `True`)
Gaussian_kernel
Whether to apply the Gaussian kernel to the distance matrix.
(Default: `False`)
F_test_prune
Whether to use a F-test to prune the tree by removing unnecessary splits.
(Default: `True`)
p_thres
p-value threshold for pruning nodes after F-test.
(Default: `0.05`)
random_state
Random seed for feature shuffling during PCT training.
(Default: `2`)
dataset_order
Order of datasets to be aligned. If this argument is specified, `reorder_dataset` is ignored.
Default to the order in the distance matrix (alphabetical order in most cases) if `reorder_dataset = False`.
reorder_dataset
Whether to reorder datasets based on their pairwise similarities.
(Default: `True`)
minimum_unique_percents
The minimum cell assignment fraction(s) to claim a cell type as uniquely matched to a cell type from the other dataset.
By default, five values will be tried (0.4, 0.5, 0.6, 0.7, 0.8) to find the one that produces least alignments in each harmonization iteration.
minimum_divide_percents
The minimum cell assignment fraction(s) to claim a cell type as divisible into two or more cell types from the other dataset.
By default, three values will be tried (0.1, 0.15, 0.2) to find the one that produces least alignments in each harmonization iteration.
maximum_novel_percent
The maximum cell assignment fraction to claim a cell type as novel to a given dataset.
(Default: `0.05`)
reannotate
Whether to reannotate cells into harmonized cell types.
(Default: `True`)
prefix
Column prefix for the reannotation data frame.
**kwargs
Other keyword arguments passed to :class:`~cellhint.pct.PredictiveClusteringTree`.
Returns
----------
DistanceAlignment
A :class:`~cellhint.align.DistanceAlignment` object. Four important attributes within this class are:
1) :attr:`~cellhint.align.DistanceAlignment.base_distance`, cross-dataset distances between all cells and all cell types.
2) :attr:`~cellhint.align.DistanceAlignment.relation`, the harmonization table.
3) :attr:`~cellhint.align.DistanceAlignment.groups`, high-hierarchy cell types categorizing rows of the harmonization table.
4) :attr:`~cellhint.align.DistanceAlignment.reannotation`, reannotated cell types and cell type groups.
"""
#raw counts are not allowed to build trees
if use_pct and adata.X[:1000].min() >= 0 and float(adata.X[:1000].max()).is_integer():
raise ValueError(
f"🛑 `.X` of the AnnData is detected to be raw counts, which is not suitable for building PCT")
#build PCT using all genes is not realistic
if use_pct and adata.n_vars > 15000:
logger.warn(f"⚠️ Warning: {adata.n_vars} features are used and may take long time for building PCT. Subsetting the AnnData into HVGs is recommended")
#generate a combined `Distance`
if use_pct:
separate_distances = Distances(adata, dataset = dataset, cell_type = cell_type, use_rep = use_rep, metric = metric, n_jobs = -1)
if filter_cells:
separate_distances.filter_cells(check_symmetry = False)
separate_distances.train(F_test_prune = F_test_prune, p_thres = p_thres, random_state = random_state, **kwargs)
combined_distance = separate_distances.predict(normalize = normalize, return_distance = True, Gaussian_kernel = Gaussian_kernel)
else:
combined_distance = Distance.from_adata(adata, dataset = dataset, cell_type = cell_type, use_rep = use_rep, metric = metric, n_jobs = -1, check_params = True)
if filter_cells:
combined_distance.filter_cells(check_symmetry = False)
if normalize:
combined_distance.normalize(Gaussian_kernel = Gaussian_kernel, rank = True, normalize = True)
#before cell type alignment
combined_distance.assign()
alignment = DistanceAlignment(combined_distance, check = False, dataset_order = dataset_order, row_normalize = True, maximum_novel_percent = maximum_novel_percent)
if dataset_order is None and reorder_dataset:
logger.info(f"🏆 Reordering datasets")
alignment.reorder_dataset()
#cell type alignment
alignment.best_align(dataset_order = None, minimum_unique_percents = minimum_unique_percents, minimum_divide_percents = minimum_divide_percents)
#reannotate
if reannotate:
logger.info(f"🖋️ Reannotating cells")
alignment.reannotate(prefix = prefix)
logger.info(f"✅ Harmonization done!")
#return
return alignment
harmonise = harmonize