# Source code for phippery.normalize

"""
=================
Normalize
=================

A set of useful functions for normalizing enrichments
in a phippery dataset.
"""

import copy
import itertools

import numpy as np
import pandas as pd
import xarray as xr
from numpy.linalg import svd

from phippery.tidy import tidy_ds
from phippery.utils import get_annotation_table
from phippery.utils import id_query
from phippery.utils import iter_groups


def standardized_enrichment(
    ds,
    lib_ds,
    beads_ds,
    data_table="counts",
    inplace=True,
    new_table_name="std_enrichment",
):
    """Compute standardized enrichment of sample counts.

    This is the *fold enrichment* of each sample's frequency
    compared to the library average frequency, minus the mock IP
    (beads only control) frequency.

    Note
    ----
    Pseudo counts and exact calculation are derived from the Bloom Lab's
    formulated normalization heuristic for differential selection. See
    https://jbloomlab.github.io/dms_tools2/diffsel.html#id5

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    lib_ds : xarray.DataSet
        The dataset of phage library control samples used in normalization
    beads_ds : xarray.DataSet
        The dataset of beads only control samples used in normalization
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended

    Example
    -------
    Separate a dataset into sample types with :func:`phippery.utils.ds_query`,
    e.g. ``ip_ds``, ``lib_ds`` and ``mockip_ds``, then::

        >>> phippery.normalize.standardized_enrichment(ip_ds, lib_ds, mockip_ds)

    modifies ``ip_ds`` in place to include a new ``std_enrichment`` table.
    A dataset whose IP, library, and beads counts are all identical yields
    all zeros, since the 1-to-1 ip-to-library fold enrichment cancels the
    1-to-1 beads-to-library fold enrichment.
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    # FIX: previously this compared `type(control) != xr.Dataset` and raised a
    # misleading message claiming the argument "must be of type list". Use
    # isinstance and name the actual requirement.
    for name, control in [("lib_ds", lib_ds), ("beads_ds", beads_ds)]:
        if not isinstance(control, xr.Dataset):
            raise ValueError(
                f"{name} must be an xarray.Dataset of control samples"
            )

    std_enrichments = _comp_std_enr(
        counts=ds[data_table].to_pandas(),
        lib_counts=lib_ds[data_table].to_pandas(),
        mock_ip_counts=beads_ds[data_table].to_pandas(),
    )

    if inplace:
        ds[new_table_name] = xr.DataArray(std_enrichments)
        return None
    else:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = xr.DataArray(std_enrichments)
        return ds_copy
def _comp_std_enr(counts, lib_counts, mock_ip_counts): """Computes standardized enrichment.""" normalized_ds_counts = copy.deepcopy(counts) # find controls and average all ds_lib_counts_mean = lib_counts.mean(axis=1) ds_bead_counts_mean = mock_ip_counts.mean(axis=1) ds_lib_counts_mean_sum = sum(ds_lib_counts_mean) # compute beads control std enrichment pseudo_sample = ds_bead_counts_mean + max( 1, sum(ds_bead_counts_mean) / ds_lib_counts_mean_sum ) pseudo_lib_counts = ds_lib_counts_mean + max( 1, ds_lib_counts_mean_sum / sum(ds_bead_counts_mean) ) pseudo_sample_freq = pseudo_sample / sum(pseudo_sample) pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts) pseudo_bead_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq # compute all sample standardized enrichment for sample_id, sample in counts.items(): pseudo_sample = sample + max(1, sum(sample) / ds_lib_counts_mean_sum) pseudo_lib_counts = ds_lib_counts_mean + max( 1, ds_lib_counts_mean_sum / sum(sample) ) pseudo_sample_freq = pseudo_sample / sum(pseudo_sample) pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts) sample_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq normalized_ds_counts.loc[:, sample_id] = ( sample_enrichment - pseudo_bead_enrichment ) return normalized_ds_counts
def enrichment(
    ds,
    lib_ds,
    data_table="counts",
    inplace=True,
    new_table_name="enrichment",
):
    """This function computes fold enrichment in the same fashion as
    the **standardized_enrichment**, but does *not* subtract beads only
    controls.

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    lib_ds : xarray.DataSet
        The dataset of phage library control samples used in normalization
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    # FIX: previously compared `type(lib_ds) != xr.Dataset` and raised a
    # misleading error about lists; use isinstance and a truthful message.
    if not isinstance(lib_ds, xr.Dataset):
        raise ValueError(
            "lib_ds must be an xarray.Dataset of library control samples"
        )

    enrichments = _comp_enr(
        counts=ds[data_table].to_pandas(), lib_counts=lib_ds[data_table].to_pandas()
    )

    if inplace:
        ds[new_table_name] = xr.DataArray(enrichments)
        return None
    else:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = xr.DataArray(enrichments)
        return ds_copy
def _comp_enr(counts, lib_counts): """Compute enrichment values.""" # we are going to add an augmented counts matrix enrichments = copy.deepcopy(counts) # find controls and average all lib_counts_mean = lib_counts.mean(axis=1) lib_counts_mean_sum = sum(lib_counts_mean) # compute all sample standardized enrichment for sample_id, sample in enrichments.items(): pseudo_sample = sample + max(1, sum(sample) / lib_counts_mean_sum) pseudo_lib_control = lib_counts_mean + max(1, lib_counts_mean_sum / sum(sample)) pseudo_sample_freq = pseudo_sample / sum(pseudo_sample) pseudo_lib_control_freq = pseudo_lib_control / sum(pseudo_lib_control) sample_enrichment = pseudo_sample_freq / pseudo_lib_control_freq enrichments.loc[:, sample_id] = sample_enrichment return enrichments
def svd_rank_reduction(
    ds,
    rank=1,
    data_table="enrichment",
    inplace=True,
    new_table_name="svd_rr",
):
    r"""
    Compute the singular value decomposition of an enrichment matrix,
    then recompute the matrix using only the top ``rank`` singular values.

    Concretely, given a matrix :math:`X` of enrichments in the `phippery`
    dataset with shape (peptides, samples), we compute the decomposition
    :math:`X = USV^{T}`, then return the recomposition using the first
    ``rank`` singular vectors and singular values.

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    rank : int
        The number of ranks to include in the reconstruction.
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    low_rank_dt = copy.deepcopy(ds[data_table].to_pandas())

    # FIX: use the reduced (thin) SVD -- the rank-truncated product below is
    # identical, but the factor matrices are far smaller for tall matrices.
    # Also renamed V -> Vt: numpy returns the transposed right factor.
    U, S, Vt = svd(low_rank_dt.values, full_matrices=False)

    # Recompose using only the first `rank` outer products.
    low_rank = U[:, :rank] @ np.diag(S[:rank]) @ Vt[:rank, :]
    low_rank_dt.loc[:, :] = low_rank

    svd_rr_approx = xr.DataArray(low_rank_dt, dims=ds[data_table].dims)

    if inplace:
        ds[new_table_name] = svd_rr_approx
        return None
    else:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = svd_rr_approx
        return ds_copy
def svd_aa_loc(
    ds,
    rank=1,
    data_table="enrichment",
    protein_name_column="Protein",
    location_col="Loc",
    aa_sub_col="aa_sub",
    inplace=True,
    new_table_name="svd_rr",
):
    """
    Compute singular value decomposition rank reduction
    on the aa / loc matrix by pivoting before computing decomposition
    and re-shaping to add to the dataset.

    Note
    ----
    This function is meant to be used specifically with phage-dms data
    where the peptide table includes a "loc" column, and an "aa_sub_col"
    which specifies the amino acid at that location.

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    rank : int
        The number of ranks to include in the reconstruction.
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    protein_name_column : str
        The peptide table feature which specifies which protein a specific
        peptide derives from.
    location_col : str
        The peptide table feature which specifies the site that a
        particular peptide is centered at.
    aa_sub_col : str
        The peptide table feature which specifies the amino acid at a
        given site.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    # start from a copy of the full enrichment table; low-rank values for
    # each (protein, sample) pair are spliced back in below
    low_rank_dt = copy.deepcopy(ds[data_table].to_pandas())
    for prot, prot_ds in iter_groups(ds, protein_name_column, "peptide"):
        for sid in prot_ds.sample_id.values:

            # grab the single sample ds
            # NOTE(review): assumes the sample table has a "sample_ID"
            # annotation column -- confirm against the dataset schema
            rep_ds = prot_ds.loc[
                dict(
                    sample_id=[sid],
                    sample_metadata=["sample_ID"],
                    peptide_metadata=[aa_sub_col, location_col],
                )
            ]

            # melt into long (tidy) form; tidy_ds comes from phippery.tidy
            tidy = tidy_ds(rep_ds)

            # Pivot so that we get the (aa X Loc) matrix of enrichments
            piv = tidy.pivot(index=aa_sub_col, columns=location_col, values=data_table)

            # Preserve the peptide ids in the same (aa X Loc) layout so the
            # low-rank values can be mapped back to the original rows
            piv_index = tidy.pivot(
                index=aa_sub_col, columns=location_col, values="peptide_id"
            )

            # compute rank reduction decomposition matrices
            U, S, V = svd(piv)

            # Grab the first X outer products in the finite summation of rank layers.
            low_rank = U[:, :rank] @ np.diag(S[:rank]) @ V[:rank, :]
            low_rank_piv = pd.DataFrame(low_rank, index=piv.index, columns=piv.columns)

            # melt values and peptide ids identically so rows stay aligned
            melted_values = pd.melt(low_rank_piv.reset_index(), id_vars=[aa_sub_col])
            melted_index = pd.melt(piv_index.reset_index(), id_vars=[aa_sub_col])
            melted_values["peptide_id"] = melted_index["value"]

            # write this sample's low-rank values back by peptide id
            low_rank_dt.loc[melted_values["peptide_id"], sid] = melted_values[
                "value"
            ].values

    svd_rr_approx = xr.DataArray(low_rank_dt, dims=ds[data_table].dims)

    if inplace:
        ds[new_table_name] = svd_rr_approx
        return None
    else:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = svd_rr_approx
        return ds_copy
def differential_selection_wt_mut(
    ds,
    data_table="enrichment",
    scaled_by_wt=False,
    smoothing_flank_size=0,
    groupby=["Protein"],  # NOTE(review): mutable default, but only read here
    loc_column="Loc",
    is_wt_column="is_wt",
    inplace=True,
    new_table_name="wt_mutant_differential_selection",
    relu_bias=None,
    skip_samples=set(),  # NOTE(review): mutable default, but only read here
):
    r"""A generalized function to compute differential selection
    of amino acid variants in relation to the wildtype sequence.
    The function computed log fold change between enrichments
    of a wildtype and mutation at any given site.

    Concretely, given some site, :math:`s`
    (defined by `loc_column` feature in the peptide table)
    in the enrichment matrix, :math:`X`, the differential selection of
    any mutation with enrichment, :math:`m`, at a site with wildtype
    enrichment, :math:`wt`, is :math:`\log_{2}(m/wt)`.

    Note
    ----
    This function is meant to be used specifically with phage-dms data
    where the peptide table includes a "loc" column, and an "aa_sub_col"
    which specifies the amino acid at that location.

    Note
    ----
    This calculation of differential selection
    is derived from the Bloom Lab's formulated from
    https://jbloomlab.github.io/dms_tools2/diffsel.html#id5

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    scaled_by_wt : bool
        A boolean flag indicating whether or not you would like to multiply
        the differential selection value by the wildtype enrichment
    smoothing_flank_size : int
        This parameter should only be used if **scaled_by_wt** is true.
        By specifying an integer greater than 0, you are scaling the
        differential selection value by enrichment values of the wildtype
        peptides surrounding, in both directions, a given site.
        The integer specified here then determines how many peptides
        are used for the scaling in both directions.
    groupby : list[str]
        This will specify which peptide feature groups such that
        site-mutation combinations are unique.
    loc_column : str
        The peptide table feature which specifies the site that a particular
        peptide is centered at.
    is_wt_column : str
        The column specifying which peptides are wildtype.
    relu_bias : int
        If an integer is specified, then enrichment values less than 1 are
        replaced by the specified value before computing differential
        selection.
    skip_samples : set
        sample id's which you do not want to calculate the differential
        selection on, such as controls. This function has many nested loops,
        so avoid computing on unnecessary samples.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    # the new layer starts as a copy of the enrichment table and is
    # overwritten site-by-site below
    diff_sel = copy.deepcopy(ds[data_table])
    sw = smoothing_flank_size

    for group, group_ds in iter_groups(ds, groupby, "peptide"):
        # all wildtype peptides of this group, and the site each covers
        wt_pep_id = id_query(group_ds, f"{is_wt_column} == True", "peptide")
        group_loc = group_ds.peptide_table.loc[wt_pep_id, loc_column].values

        for i, loc in enumerate(group_loc):
            # every peptide (wt + mutants) centered at this site
            loc_pid = id_query(group_ds, f"{loc_column} == {loc}", "peptide")
            loc_ds = group_ds.loc[dict(peptide_id=loc_pid)]

            # check that skip samples is of type list
            sams = set(loc_ds.sample_id.values) - set(skip_samples)
            for sam_id in sams:
                # wildtype enrichment trace across the group's sites,
                # and the wt value at this particular site
                wt_seq_enr = group_ds[data_table].loc[wt_pep_id, sam_id].values
                wt_enrichment = float(wt_seq_enr[i])
                # optional scaling by the (flank-smoothed) wt enrichment
                scalar = (
                    _wt_window_scalar(list(wt_seq_enr), i, sw) if scaled_by_wt else 1
                )
                values = loc_ds[data_table].loc[:, sam_id].values
                if relu_bias is not None:
                    # NOTE(review): `values` may be a view into the dataset's
                    # backing array, so this in-place clamp could mutate
                    # ds[data_table] -- confirm intended
                    values[values < 1] = relu_bias
                dsel = _comp_diff_sel(wt_enrichment, values, scalar)
                diff_sel.loc[list(loc_ds.peptide_id.values), sam_id] = dsel
                # sanity check: wt-vs-wt differential selection must be zero.
                # NOTE(review): indexes the group's *first* wt peptide, not
                # the wt at this site (wt_pep_id[i]) -- verify intent
                assert diff_sel.loc[wt_pep_id[0], sam_id] == 0.0

    if inplace:
        ds[new_table_name] = diff_sel
        return None
    else:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = diff_sel
        return ds_copy
def _wt_window_scalar(wt_enr, i, flank_size): """ Get a scalar from a wt sequence with a certain flank size. """ if flank_size == 0: return wt_enr[i] lcase = i - flank_size < 0 rcase = i + flank_size + 1 > len(wt_enr) lflank = wt_enr[i - flank_size : i] if not lcase else wt_enr[:i] rflank = wt_enr[i : i + flank_size + 1] if not rcase else wt_enr[i:] window_enr = lflank + rflank return sum(window_enr) / len(window_enr)
def differential_selection_sample_groups(
    ds,
    sample_feature="library_batch",
    is_equal_to="batch_a",
    data_table="counts",
    aggregate_function=np.mean,
    inplace=True,
    new_table_name="sample_group_differential_selection",
):
    """Compute differential selection of all samples against an aggregate
    of one group of samples.

    Note
    ----
    This function is still experimental.

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    sample_feature : str
        The sample table feature used to select the reference group.
    is_equal_to : str
        The value of ``sample_feature`` identifying the reference group.
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    aggregate_function : callable
        Function used to collapse the reference group's enrichments,
        peptide by peptide (default ``np.mean``).
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    diff_sel = copy.deepcopy(ds[data_table])

    # samples making up the reference group
    group_id = id_query(ds, f"{sample_feature} == '{is_equal_to}'")

    # collapse the group's enrichments, one aggregate value per peptide
    reference = np.apply_along_axis(
        aggregate_function, 1, ds[data_table].loc[:, group_id].values
    )

    # log-fold change of every sample against the aggregated reference
    for base, pid in zip(reference, ds.peptide_id.values):
        row = ds[data_table].loc[pid, :].values
        diff_sel.loc[pid, :] = _comp_diff_sel(base, row)

    if inplace:
        ds[new_table_name] = diff_sel
        return None
    ds_copy = copy.deepcopy(ds)
    ds_copy[new_table_name] = diff_sel
    return ds_copy
def _comp_diff_sel(base, all_other_values, scalar=1): """ a private function to compute differential selection of one values to a list of other values. Optionally, you can scale each of the differential selection values by the base if desired. """ if np.any(np.array(all_other_values) == 0): raise ZeroDivisionError( f"All values for which we are computing differential selection must be non-zero" ) diff_sel = np.array([np.log2(v / base) for v in all_other_values]) return diff_sel * scalar
def size_factors(ds, inplace=True, data_table="counts", new_table_name="size_factors"):
    r"""
    Warning
    -------
    This method is deprecated. It is currently maintained only for
    reproducibility of previous results.

    Compute size factors from
    `Anders and Huber 2010
    <https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106>`_.

    Concretely, given a matrix of enrichments :math:`X_{i,j}` with shape
    (peptides, samples), consider a *pseudo-reference sample* where each
    peptide count is the geometric mean of that peptide's counts across
    samples. The size factor for sample :math:`j` is the median ratio of
    its counts to the pseudo-reference:

    .. math::
       \hat S_{j} = {median \atop i} \frac{X_{i,j}}{(\prod_{v=1}^{m}{X_{i,v}})^{1/m}}

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    sf_da = xr.DataArray(
        _comp_size_factors(ds[data_table].to_pandas().values),
        dims=ds[data_table].dims,
    )

    if inplace:
        ds[new_table_name] = sf_da
        return None
    ds_copy = copy.deepcopy(ds)
    ds_copy[new_table_name] = sf_da
    return ds_copy
def _comp_size_factors(counts): size_factors = copy.deepcopy(counts) masked = np.ma.masked_equal(counts, 0) if type(masked.mask) != np.ndarray: bool_mask = np.full(counts.shape, False, dtype=bool) else: bool_mask = ~masked.mask geom_means = ( np.ma.exp(np.ma.log(masked).sum(axis=1) / (bool_mask).sum(axis=1)) .data[np.newaxis] .T ) size_factors = ( size_factors / np.ma.median(masked / geom_means, axis=0).data ).round(2) return size_factors
def counts_per_million(
    ds, inplace=True, new_table_name="cpm", per_sample=True, data_table="counts"
):
    r"""Compute counts per million.

    Given a matrix of enrichments :math:`X` with P peptides and S samples,
    position :math:`(i, j)` becomes:

    .. math::
       \text{cpm}(X,i,j) = \left\{
           \begin{array}{@{}ll@{}}
               X_{i,j}/\sum_{p\in P} X_{p,j} \times 10^6, & \text{if per_sample is True} \\[10pt]
               X_{i,j}/\sum_{p\in P}\sum_{s\in S} X_{p,s} \times 10^6, & \text{if per_sample is False}
           \end{array}
       \right.

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    per_sample : bool
        If True, compute counts per million separately for each sample.
        Otherwise, frequencies are computed as a ratio of each count to
        the sum of all counts in the dataset.
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended

    Example
    -------
    >>> from phippery.normalize import counts_per_million
    >>> counts_per_million(ds, new_table_name="cpm", inplace=True)
    >>> ds.cpm  # each column of ds.counts scaled to sum to 1e6, rounded
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    raw = ds[data_table].to_pandas().values
    normalized = _comp_cpm_per_sample(raw) if per_sample else _comp_cpm(raw)
    cpm_da = xr.DataArray(normalized, dims=ds[data_table].dims)

    if not inplace:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = cpm_da
        return ds_copy
    ds[new_table_name] = cpm_da
    return None
def _comp_cpm(counts): ret = copy.deepcopy(counts) return (ret / (ret.sum() / 1e6)).round(2) def _comp_cpm_per_sample(counts): ret = copy.deepcopy(counts) return (ret / (ret.sum(axis=0) / 1e6)).round(2)
def rank_data(
    ds,
    data_table="counts",
    inplace=True,
    per_sample=False,
    new_table_name="rank",
):
    """Compute the rank of the specified enrichment layer.

    Ranks are ascending order positions: within the ranked scope (per
    sample if ``per_sample`` is True, otherwise the entire matrix), the
    smallest value receives rank 0 and the largest rank N-1.

    Parameters
    ----------
    ds : xarray.DataSet
        The dataset you would like to fit to
    per_sample : bool
        If True, compute the rank separately within each sample. Otherwise,
        ranks are computed jointly across all values in the layer.
    data_table : str
        The name of the enrichment layer you would like to fit mlxp to.
    new_table_name : str
        The name of the new layer you would like to append to the dataset.
    inplace : bool
        If True, then this function appends a dataArray to ds which is indexed
        with the same coordinate dimensions as 'data_table'.
        If False, a copy of ds is returned with the appended dataArray

    Returns
    -------
    xarray.DataSet :
        If inplace is False, return a new DataSet object which has
        the enrichment values appended

    Example
    -------
    >>> ds["counts"].values
    array([[533, 734],
           [563, 947],
           [ 47, 705]])
    >>> rank_data(ds, data_table="counts", per_sample=True)
    >>> ds["rank"].values
    array([[1, 1],
           [2, 2],
           [0, 0]])
    """

    if data_table not in ds:
        avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
        raise KeyError(
            f"{data_table} is not included in dataset. \n available datasets: {avail}"
        )

    counts = ds[data_table].to_pandas().values
    # FIX: default was the pointless f-string f"rank"; also renamed the
    # misleading `cpm` locals that were copied from counts_per_million,
    # and corrected the docstring (ranks are ascending, not descending).
    ranks = _comp_rank_per_sample(counts) if per_sample else _comp_rank(counts)
    rank_da = xr.DataArray(ranks, dims=ds[data_table].dims)

    if inplace:
        ds[new_table_name] = rank_da
        return None
    else:
        ds_copy = copy.deepcopy(ds)
        ds_copy[new_table_name] = rank_da
        return ds_copy
def _comp_rank(enrichment): ret = np.ones(enrichment.shape) sample_data_sh = enrichment.shape sample_data = enrichment.flatten() temp = sample_data.argsort() ranks = np.empty_like(temp) ranks[temp] = np.arange(len(sample_data)) ret[:, :] = ranks.reshape(sample_data_sh) return ret.astype(int) def _comp_rank_per_sample(enrichment): ret = np.ones(enrichment.shape) for sid in range(enrichment.shape[1]): sample_data = enrichment[:, sid] temp = sample_data.argsort() ranks = np.empty_like(temp) ranks[temp] = np.arange(len(sample_data)) ret[:, sid] = ranks.flatten() return ret.astype(int)