from anndata import AnnData
from typing import Union, Optional
import pandas as pd
import numpy as np
from .distance import Distance
from .align import DistanceAlignment
from . import logger
# (stray "[docs]" link marker from the Sphinx source view removed)
def selfmatch(X: Union[AnnData, pd.DataFrame],
              columns: Union[list, tuple, np.ndarray, pd.Series, pd.Index],
              calculate_distance: bool = False,
              use_rep: Optional[str] = None, metric: Optional[str] = None,
              normalize: bool = True, Gaussian_kernel: bool = False,
              minimum_unique_percents: Union[list, tuple, np.ndarray, pd.Series, pd.Index, float] = (0.4, 0.5, 0.6, 0.7, 0.8),
              minimum_divide_percents: Union[list, tuple, np.ndarray, pd.Series, pd.Index, float] = (0.1, 0.15, 0.2),
              reannotate: bool = True, prefix: str = '') -> DistanceAlignment:
    """
    Match different versions of cell type annotations (e.g., different resolutions of clustering) for cells from a single dataset.

    Each annotation column is treated as if it were a separate dataset: the cells are replicated once
    per column and then harmonized with :class:`~cellhint.align.DistanceAlignment`.

    Parameters
    ----------
    X
        An :class:`~anndata.AnnData` or :class:`~pandas.DataFrame` object containing information of different cell type annotations as multiple columns of cell metadata.
    columns
        Column names (keys) of cell metadata representing cell type annotations or clusterings.
        At least two distinct names are required; repeated names are collapsed, keeping the first occurrence.
    calculate_distance
        Whether to calculate the cell-by-cell-type distance matrix. This is usually not necessary as all annotations are in place for a single dataset.
        When `False`, an all-zero distance matrix is used.
        (Default: `False`)
    use_rep
        Representation used to calculate distances. This can be `'X'` or any representations stored in `.obsm`.
        This argument will be ignored when `calculate_distance = False` (the default).
        Defaults to the PCA coordinates if present (if not, use the expression matrix `X`).
    metric
        Metric to calculate the distance between each cell and each cell type. Can be `'euclidean'`, `'cosine'`, `'manhattan'` or any metrics applicable to :func:`sklearn.metrics.pairwise_distances`.
        This argument will be ignored when `calculate_distance = False` (the default).
        Defaults to `'euclidean'` if latent representations are used for calculating distances, and to `'correlation'` if the expression matrix is used.
    normalize
        Whether to normalize the distance matrix.
        This argument will be ignored when `calculate_distance = False` (the default).
        (Default: `True`)
    Gaussian_kernel
        Whether to apply the Gaussian kernel to the distance matrix.
        This argument will be ignored when `calculate_distance = False` (the default).
        (Default: `False`)
    minimum_unique_percents
        The minimum cell assignment fraction(s) to claim two cell types as uniquely matched.
        By default, five values will be tried (0.4, 0.5, 0.6, 0.7, 0.8) to find the one that produces least alignments in each harmonization iteration.
    minimum_divide_percents
        The minimum cell assignment fraction(s) to claim a cell type as divisible into two or more cell types.
        By default, three values will be tried (0.1, 0.15, 0.2) to find the one that produces least alignments in each harmonization iteration.
    reannotate
        Whether to reannotate cells into harmonized cell types.
        (Default: `True`)
    prefix
        Column prefix for the reannotation data frame.

    Returns
    ----------
    DistanceAlignment
        A :class:`~cellhint.align.DistanceAlignment` object. Four important attributes within this class are:
        1) :attr:`~cellhint.align.DistanceAlignment.base_distance`, within-dataset distances between all cells and all cell types.
        2) :attr:`~cellhint.align.DistanceAlignment.relation`, the harmonization table.
        3) :attr:`~cellhint.align.DistanceAlignment.groups`, high-hierarchy cell types categorizing rows of the harmonization table.
        4) :attr:`~cellhint.align.DistanceAlignment.reannotation`, reannotated cell types and cell type groups.

    Raises
    ------
    TypeError
        If `X` is neither an `AnnData` nor a `DataFrame`, if fewer than two distinct columns are given,
        or if `calculate_distance = True` while `X` is not an `AnnData`.
    ValueError
        If any of `columns` is absent from the cell metadata.

    Notes
    -----
    When `calculate_distance = True`, a temporary `'__dataset__'` column is written into `X.obs`
    and dropped again before returning, including when the distance calculation fails.
    """
    #input
    if not isinstance(X, (AnnData, pd.DataFrame)):
        raise TypeError(
                "🛑 Please provide a correct input - an `anndata.AnnData` or `pandas.DataFrame`")
    df = X.obs if isinstance(X, AnnData) else X
    #columns
    if isinstance(columns, str):
        raise TypeError(
                "🛑 Please provide at least two columns")
    columns = np.array(columns).ravel()
    # Collapse repeated names while keeping the caller's order: the cell metadata below is built
    # from the unique names, so the replication count must be the number of distinct columns.
    _, first_seen = np.unique(columns, return_index = True)
    columns = columns[np.sort(first_seen)]
    if columns.size < 2:
        raise TypeError(
                "🛑 Please provide at least two columns")
    missing = set(columns).difference(df.columns)
    if missing:
        raise ValueError(
                f"🛑 The following column(s) are not found: {missing}")
    # Fail fast, before any metadata is assembled.
    if calculate_distance and not isinstance(X, AnnData):
        raise TypeError(
                "🛑 To calculate the distance, please provide an AnnData as input")
    sorted_columns = np.unique(columns)
    n_columns = len(columns)
    labels = {column: df[column].astype(str) for column in sorted_columns}
    #meta
    # Cast the index to str so that the ID concatenation below also works for non-string
    # indices (e.g. the default RangeIndex of a plain DataFrame).
    cell_ids = df.index.astype(str)
    cell = pd.concat([pd.DataFrame(dict(dataset = column, ID = cell_ids, cell_type = labels[column])) for column in sorted_columns], axis = 0, ignore_index = True)
    # Prefix each ID with its annotation column so the replicated cells stay distinguishable.
    cell['ID'] = cell.dataset + '__' + cell.ID
    # NOTE(review): cell types here come from the raw column values, whereas `cell` above uses
    # their `str` form - confirm whether `Distance` expects the two to share a dtype.
    cell_type = pd.concat([pd.DataFrame(dict(dataset = column, cell_type = np.unique(df[column]))) for column in sorted_columns], axis = 0, ignore_index = True)
    #dist mat
    if calculate_distance:
        # All cells belong to one dataset; give `Distance.from_adata` a constant dataset key.
        X.obs['__dataset__'] = '__constant__'
        try:
            distances = [Distance.from_adata(X, dataset = '__dataset__', cell_type = column, use_rep = use_rep, metric = metric, n_jobs = -1, check_params = True) for column in sorted_columns]
            distance = distances[0].concatenate(distances[1:], by = 'cell_type', check = False)
            if normalize:
                distance.normalize(Gaussian_kernel = Gaussian_kernel, rank = True, normalize = True)
            # Replicate the cells once per annotation column to match the rows of `cell`.
            stacked = distance.concatenate([distance] * (n_columns - 1), by = 'cell', check = False)
            dist_mat = stacked.dist_mat
        finally:
            # Always remove the helper column so the caller's AnnData is left as it was given.
            X.obs.drop(columns = ['__dataset__'], inplace = True)
    else:
        dist_mat = np.zeros((cell.shape[0], cell_type.shape[0]))
    #distance
    combined_distance = Distance(dist_mat, cell, cell_type)
    #assignment
    # The existing labels serve as the assignment: one column per annotation, repeated for each replicate.
    per_cell_labels = pd.concat([labels[column] for column in sorted_columns], axis = 1)
    combined_distance.assignment = pd.concat([per_cell_labels] * n_columns, axis = 0)
    combined_distance.assignment.index = combined_distance.cell.index
    combined_distance.assignment.columns = sorted_columns
    #alignment
    alignment = DistanceAlignment(combined_distance, check = False, dataset_order = columns, row_normalize = True, maximum_novel_percent = -1)
    alignment.best_align(dataset_order = None, minimum_unique_percents = minimum_unique_percents, minimum_divide_percents = minimum_divide_percents)
    if reannotate:
        logger.info("🖋️ Reannotating cells")
        alignment.reannotate(prefix = prefix)
    logger.info("✅ Harmonization done!")
    #return
    return alignment