"""
=================
Normalize
=================
A set of useful functions for normalizing enrichments
in a phippery dataset.
"""
import numpy as np
from numpy.linalg import svd
import xarray as xr
import pandas as pd
import itertools
import copy
from phippery.utils import iter_groups
from phippery.utils import id_query
from phippery.utils import get_annotation_table
[docs]def standardized_enrichment(
ds,
lib_ds,
beads_ds,
data_table="counts",
inplace=True,
new_table_name="std_enrichment",
):
"""Compute standardized enrichment of sample counts.
This is the *fold enrichment* of each sample's frequency
compared to the library average frequency, minus the mock IP
(beads only control) frequency.
Note
----
Psuedo counts and exact calculation are
derived from the Bloom Lab's formulated normalization
heuristic for differential selection. See
https://jbloomlab.github.io/dms_tools2/diffsel.html#id5
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
lib_ds : xarray.DataSet
The dataset of phage library control samples used in normalization
beads_ds : xarray.DataSet
The dataset of beads only control samples used in normalization
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
Example
-------
Given a dataset, ``ds``, with the following samples and
counts across 10 peptides:
>>> phippery.get_annotation_table(ds, "sample")
sample_metadata fastq_filename reference seq_dir sample_type
sample_id
0 sample_0.fastq refa expa beads_only
1 sample_1.fastq refa expa beads_only
2 sample_2.fastq refa expa library
3 sample_3.fastq refa expa library
4 sample_4.fastq refa expa IP
5 sample_5.fastq refa expa IP
6 sample_6.fastq refa expa IP
7 sample_7.fastq refa expa IP
8 sample_8.fastq refa expa IP
9 sample_9.fastq refa expa IP
>>> ds.counts
<xarray.DataArray 'counts' (peptide_id: 5, sample_id: 10)>
array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
Coordinates:
* sample_id (sample_id) int64 0 1 2 3 4 5 6 7 8 9
* peptide_id (peptide_id) int64 0 1 2 3 4
First, separate the dataset into it's various samples types using
:func:`phippery.utils.ds_query`
>>> ip_ds = ds_query(ds, "sample_type == 'IP'")
>>> lib_ds = ds_query(ds, "sample_type == 'library'")
>>> mockip_ds = ds_query(ds, "sample_type == 'beads_only'")
We can then compute standardized fold enrichment like so:
>>> phippery.normalize.standardized_enrichment(ip_ds, lib_ds, mockip_ds)
which will modify the ``ip_ds`` dataset inplace to include a new table
>>> ip_ds.std_enrichment
<xarray.DataArray 'std_enrichment' (peptide_id: 5, sample_id: 6)>
array([[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.]])
Coordinates:
* sample_id (sample_id) int64 4 5 6 7 8 9
* peptide_id (peptide_id) int64 0 1 2 3 4
Note that we expect the result to be all zeros because
a 1-to-1 fold enrichment for ip's to library samples minus
1-to-1 beads to library
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
for control in [lib_ds, beads_ds]:
if type(control) != xr.Dataset:
raise ValueError(
"ds_lib_control_indicies must be of type list, even if there is only a single value"
)
std_enrichments = _comp_std_enr(
counts=ds[data_table].to_pandas(),
lib_counts=lib_ds[data_table].to_pandas(),
mock_ip_counts=beads_ds[data_table].to_pandas(),
)
if inplace:
ds[new_table_name] = xr.DataArray(std_enrichments)
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = xr.DataArray(std_enrichments)
return ds_copy
def _comp_std_enr(counts, lib_counts, mock_ip_counts):
"""Computes standardized enrichment."""
normalized_ds_counts = copy.deepcopy(counts)
# find controls and average all
ds_lib_counts_mean = lib_counts.mean(axis=1)
ds_bead_counts_mean = mock_ip_counts.mean(axis=1)
ds_lib_counts_mean_sum = sum(ds_lib_counts_mean)
# compute beads control std enrichment
pseudo_sample = ds_bead_counts_mean + max(
1, sum(ds_bead_counts_mean) / ds_lib_counts_mean_sum
)
pseudo_lib_counts = ds_lib_counts_mean + max(
1, ds_lib_counts_mean_sum / sum(ds_bead_counts_mean)
)
pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts)
pseudo_bead_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq
# compute all sample standardized enrichment
for sample_id, sample in counts.items():
pseudo_sample = sample + max(1, sum(sample) / ds_lib_counts_mean_sum)
pseudo_lib_counts = ds_lib_counts_mean + max(
1, ds_lib_counts_mean_sum / sum(sample)
)
pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts)
sample_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq
normalized_ds_counts.loc[:, sample_id] = (
sample_enrichment - pseudo_bead_enrichment
)
return normalized_ds_counts
[docs]def enrichment(
ds,
lib_ds,
data_table="counts",
inplace=True,
new_table_name="enrichment",
):
"""This function computes fold enrichment in the same fashion as
the **standardized_enrichment**, but does *not* subtract beads only controls
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
lib_ds : xarray.DataSet
The dataset of phage library control samples used in normalization
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
if type(lib_ds) != xr.Dataset:
raise ValueError(
"ds_lib_control_indicies must be of type list, even if there is only a single value"
)
enrichments = _comp_enr(
counts=ds[data_table].to_pandas(), lib_counts=lib_ds[data_table].to_pandas()
)
if inplace:
ds[new_table_name] = xr.DataArray(enrichments)
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = xr.DataArray(enrichments)
return ds_copy
def _comp_enr(counts, lib_counts):
"""Compute enrichment values."""
# we are going to add an augmented counts matrix
enrichments = copy.deepcopy(counts)
# find controls and average all
lib_counts_mean = lib_counts.mean(axis=1)
lib_counts_mean_sum = sum(lib_counts_mean)
# compute all sample standardized enrichment
for sample_id, sample in enrichments.items():
pseudo_sample = sample + max(1, sum(sample) / lib_counts_mean_sum)
pseudo_lib_control = lib_counts_mean + max(1, lib_counts_mean_sum / sum(sample))
pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
pseudo_lib_control_freq = pseudo_lib_control / sum(pseudo_lib_control)
sample_enrichment = pseudo_sample_freq / pseudo_lib_control_freq
enrichments.loc[:, sample_id] = sample_enrichment
return enrichments
[docs]def svd_rank_reduction(
ds,
rank=1,
data_table="enrichment",
inplace=True,
new_table_name="svd_rr",
):
r"""
This function computes the singular value decomposition,
then recomputes the enrichment matrix up to the rank specified.
Concretely, given a Matrix of, :math:`X` enrichments in the
`phippery` dataset with shape (peptides, samples). We compute
the decomposition :math:`X = USV^{T}`, then return the
recomposition using the first
``rank`` eigenvectors and eigenvaluesvalues.
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
rank : int
The number of ranks to include in the reconstruction.
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
low_rank_dt = copy.deepcopy(ds[data_table].to_pandas())
# compute rank reduction decomposition matrices
U, S, V = svd(low_rank_dt.values)
# Grab the first X outer products in the finite summation of rank layers.
low_rank = U[:, :rank] @ np.diag(S[:rank]) @ V[:rank, :]
low_rank_dt.loc[:, :] = low_rank
svd_rr_approx = xr.DataArray(low_rank_dt, dims=ds[data_table].dims)
if inplace:
ds[new_table_name] = svd_rr_approx
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = svd_rr_approx
return ds_copy
[docs]def svd_aa_loc(
ds,
rank=1,
data_table="enrichment",
protein_name_column="Protein",
location_col="Loc",
aa_sub_col="aa_sub",
inplace=True,
new_table_name="svd_rr",
):
"""
Compute singular value decomposition rank reduction
on the aa / loc matrix by pivoting before computing decomposition
and re-shaping to add to the dataset.
Note
----
This function is meant to be used specifically with phage-dms data
where the peptide table includes a "loc" column, and an "aa_sub_col"
which specifies the amino acid at that location.
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
protein_name_column : str
The peptide table feature which specifies which protein a specific peptide
derives from.
location_col : str
The peptide table feature which specifies the site that a particular peptide
is centered at.
aa_sub_col : str
The peptide table feature which specifies the amino acid at a given site.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
low_rank_dt = copy.deepcopy(ds[data_table].to_pandas())
for prot, prot_ds in iter_groups(ds, protein_name_column, "peptide"):
for sid in prot_ds.sample_id.values:
# grab the single sample ds
rep_ds = prot_ds.loc[
dict(
sample_id=[sid],
sample_metadata=["sample_ID"],
peptide_metadata=[aa_sub_col, location_col],
)
]
# melt
tidy = tidy_ds(rep_ds)
# Pivot so that we get the (aa X Loc)
piv = tidy.pivot(index=aa_sub_col, columns=location_col, values=data_table)
# Preserve the indices for population of new enrichment table
piv_index = tidy.pivot(
index=aa_sub_col, columns=location_col, values="peptide_id"
)
# compute rank reduction decompisition matrices
U, S, V = svd(piv)
# Grab the first X outer products in the finite summation of rank layers.
low_rank = U[:, :rank] @ np.diag(S[:rank]) @ V[:rank, :]
low_rank_piv = pd.DataFrame(low_rank, index=piv.index, columns=piv.columns)
melted_values = pd.melt(low_rank_piv.reset_index(), id_vars=[aa_sub_col])
melted_index = pd.melt(piv_index.reset_index(), id_vars=[aa_sub_col])
melted_values["peptide_id"] = melted_index["value"]
low_rank_dt.loc[melted_values["peptide_id"], sid] = melted_values[
"value"
].values
svd_rr_approx = xr.DataArray(low_rank_dt, dims=ds[data_table].dims)
if inplace:
ds[new_table_name] = svd_rr_approx
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = svd_rr_approx
return ds_copy
[docs]def differential_selection_wt_mut(
ds,
data_table="enrichment",
scaled_by_wt=False,
smoothing_flank_size=0,
groupby=["Protein"],
loc_column="Loc",
is_wt_column="is_wt",
inplace=True,
new_table_name="wt_mutant_differential_selection",
relu_bias=None,
skip_samples=set(),
):
r"""A generalized function to compute differential selection
of amino acid variants in relation to the wildtype sequence.
The function computed log fold change between enrichments
of a wildtype and mutation at any given site.
Concretely, given some site, :math:`s` (defined by `loc_column`
feature in the peptide table) in the enrichment
matrix, :math:`X`,
the differential selection of any mutation with enrichment, :math:`m`,
at a site with wildtype enrichment, :math:`wt`, is :math:`\log_{2}(m/wt)`.
Note
----
This function is meant to be used specifically with phage-dms data
where the peptide table includes a "loc" column, and an "aa_sub_col"
which specifies the amino acid at that location.
Note
----
This calculation of differential selection
is derived from the Bloom Lab's formulated from
https://jbloomlab.github.io/dms_tools2/diffsel.html#id5
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
scaled_by_wt : bool
A boolean flag indicating whether or not you would like to multiply the
differential selection value by the wildtype enrichment
smoothing_flank_size : int
This parameter should only be used if **scaled_by_wt** is true.
By specifying an integer greater than 0, you are scaling the differential
selection value by enrichment values of the wildtype peptides surrounding
, in both directions, a given site. The integer specified here then
determines how many peptides are used for the scaling in both directions.
groupby: list[str]
This will specify which peptide feature groups such that site-mutation
combinations are unique.
loc_column : str
The peptide table feature which specifies the site that a particular peptide
is centered at.
is_wt_column : str
The column specifying which peptides are wildtype.
relu_bias : int
If an integer is specified, then enrichment values less than
1 are replaced by the specified value before computing differential
selection.
skip_samples : set
sample id's which you do not want to calculate the differential selection on,
such as controls. This function has many nested loops, so avoid computing
on unnecessary samples.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
diff_sel = copy.deepcopy(ds[data_table])
sw = smoothing_flank_size
for group, group_ds in iter_groups(ds, groupby, "peptide"):
wt_pep_id = id_query(
group_ds, f"{is_wt_column} == True", "peptide"
)
group_loc = group_ds.peptide_table.loc[wt_pep_id, loc_column].values
for i, loc in enumerate(group_loc):
loc_pid = id_query(
group_ds, f"{loc_column} == {loc}", "peptide"
)
loc_ds = group_ds.loc[dict(peptide_id=loc_pid)]
# check that skip samples is of type list
sams = set(loc_ds.sample_id.values) - set(skip_samples)
for sam_id in sams:
wt_seq_enr = group_ds[data_table].loc[wt_pep_id, sam_id].values
wt_enrichment = float(wt_seq_enr[i])
scalar = (
_wt_window_scalar(list(wt_seq_enr), i, sw) if scaled_by_wt else 1
)
values = loc_ds[data_table].loc[:, sam_id].values
if relu_bias is not None:
values[values < 1] = relu_bias
dsel = _comp_diff_sel(wt_enrichment, values, scalar)
diff_sel.loc[list(loc_ds.peptide_id.values), sam_id] = dsel
assert diff_sel.loc[wt_pep_id[0], sam_id] == 0.0
if inplace:
ds[new_table_name] = diff_sel
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = diff_sel
return ds_copy
def _wt_window_scalar(wt_enr, i, flank_size):
"""
Get a scalar from a wt sequence with a certain flank size.
"""
if flank_size == 0:
return wt_enr[i]
lcase = i - flank_size < 0
rcase = i + flank_size + 1 > len(wt_enr)
lflank = wt_enr[i - flank_size : i] if not lcase else wt_enr[:i]
rflank = wt_enr[i : i + flank_size + 1] if not rcase else wt_enr[i:]
window_enr = lflank + rflank
return sum(window_enr) / len(window_enr)
[docs]def differential_selection_sample_groups(
ds,
sample_feature="library_batch",
is_equal_to="batch_a",
data_table="counts",
aggregate_function=np.mean,
inplace=True,
new_table_name="sample_group_differential_selection",
):
"""This function computes differential selection
between groups of samples.
Note
----
This function is still experimental.
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
diff_sel = copy.deepcopy(ds[data_table])
group_id = id_query(
ds, f"{sample_feature} == '{is_equal_to}'"
)
group_enrichments = ds[data_table].loc[:, group_id].values
group_agg = np.apply_along_axis(aggregate_function, 1, group_enrichments)
for agg_enrich, peptide_id in zip(group_agg, ds.peptide_id.values):
all_other_sample_values = ds[data_table].loc[peptide_id, :].values
peptide_diff_sel = _comp_diff_sel(agg_enrich, all_other_sample_values)
diff_sel.loc[peptide_id, :] = peptide_diff_sel
if inplace:
ds[new_table_name] = diff_sel
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = diff_sel
return ds_copy
def _comp_diff_sel(base, all_other_values, scalar=1):
"""
a private function to compute differential selection of one values to a list of other values.
Optionally, you can scale each of the differential selection values by the base if desired.
"""
if np.any(np.array(all_other_values) == 0):
raise ZeroDivisionError(
f"All values for which we are computing differential selection must be non-zero"
)
diff_sel = np.array([np.log2(v / base) for v in all_other_values])
return diff_sel * scalar
[docs]def size_factors(ds, inplace=True, data_table="counts", new_table_name="size_factors"):
r"""
Warning
-------
This method is deprecated. It is currently maintained only for reproducibility of previous results.
Compute size factors from
`Anders and Huber 2010
<https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106>`_.
Concretely, given a Matrix of enrichments, :math:`X_{i,j}`, in the
`phippery` dataset with shape (peptides, samples). Consider a *pseudo-reference sample*
where the count of each peptide is the geometric mean of counts for that peptide across
the samples in the dataset. For a sample to be normalized, calculate for each peptide
the ratio of peptide count in the sample to the peptide count in the pseudo-reference
sample. The median of this ratio, :math:`\hat S_{j}`, is the size factor to normalize
all counts in sample :math:`j`,
.. math::
\hat S_{j} = {median \atop i} \frac{X_{i,j}}{(\prod_{v=1}^{m}{X_{i,v}})^{1/m}}
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
size_factors = _comp_size_factors(ds[data_table].to_pandas().values)
sf_da = xr.DataArray(size_factors, dims=ds[data_table].dims)
if inplace:
ds[new_table_name] = sf_da
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = sf_da
return ds_copy
def _comp_size_factors(counts):
size_factors = copy.deepcopy(counts)
masked = np.ma.masked_equal(counts, 0)
if type(masked.mask) != np.ndarray:
bool_mask = np.full(counts.shape, False, dtype=bool)
else:
bool_mask = ~masked.mask
geom_means = (
np.ma.exp(np.ma.log(masked).sum(axis=1) / (bool_mask).sum(axis=1))
.data[np.newaxis]
.T
)
size_factors = (
size_factors / np.ma.median(masked / geom_means, axis=0).data
).round(2)
return size_factors
[docs]def counts_per_million(
ds, inplace=True, new_table_name="cpm", per_sample=True, data_table="counts"
):
r"""Compute counts per million.
Concretely, given a Matrix of enrichments, :math:`X`, in the
`phippery` dataset with shape P peptides and S samples,
we compute the :math:`i^{th}` peptide and :math:`j^{th}` sample position like so:
.. math::
\text{cpm}(X,i,j) = \left\{
\begin{array}{@{}ll@{}}
X_{i,j}/\sum_{p\in P} X_{p,j} \times 10^6, & \text{if per_sample is True} \\[10pt]
X_{i,j}/\sum_{p\in P}\sum_{s\in S} X_{p,s} \times 10^6, & \text{if per_sample is False}
\end{array}
\right.
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
per_sample : bool
If True, compute counts per million separately for each sample.
Otherwise, frequencies are computed as a ratio of each count to the sum of
all counts in the dataset.
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
Example
-------
>>> ds.counts.values
array([[8, 8, 3, 1],
[1, 3, 7, 4],
[9, 0, 5, 1],
[5, 2, 4, 4],
[1, 4, 1, 3],
[3, 4, 4, 0],
[4, 8, 2, 7],
[4, 3, 6, 5],
[0, 2, 9, 1],
[0, 5, 6, 5]])
>>> from phippery.normalize import counts_per_million
>>> counts_per_million(ds, new_table_name="cpm", inplace=True)
>>> ds.cpm.values
array([[228571.43, 205128.21, 63829.79, 32258.06],
[ 28571.43, 76923.08, 148936.17, 129032.26],
[257142.86, 0. , 106382.98, 32258.06],
[142857.14, 51282.05, 85106.38, 129032.26],
[ 28571.43, 102564.1 , 21276.6 , 96774.19],
[ 85714.29, 102564.1 , 85106.38, 0. ],
[114285.71, 205128.21, 42553.19, 225806.45],
[114285.71, 76923.08, 127659.57, 161290.32],
[ 0. , 51282.05, 191489.36, 32258.06],
[ 0. , 128205.13, 127659.57, 161290.32]])
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
counts = ds[data_table].to_pandas().values
cpm = _comp_cpm_per_sample(counts) if per_sample else _comp_cpm(counts)
cpm_da = xr.DataArray(cpm, dims=ds[data_table].dims)
if inplace:
ds[new_table_name] = cpm_da
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = cpm_da
return ds_copy
def _comp_cpm(counts):
ret = copy.deepcopy(counts)
return (ret / (ret.sum() / 1e6)).round(2)
def _comp_cpm_per_sample(counts):
ret = copy.deepcopy(counts)
return (ret / (ret.sum(axis=0) / 1e6)).round(2)
[docs]def rank_data(
ds,
data_table="counts",
inplace=True,
per_sample=False,
new_table_name=f"rank",
):
"""Compute the rank of specified enrichment layer.
The rank is decending
Parameters
----------
ds : xarray.DataSet
The dataset you would like to fit to
per_sample : bool
If True, compute rank separately for each sample.
Otherwise, frequencies are computed as a ratio of each count to the sum of
all counts in the dataset.
data_table : str
The name of the enrichment layer you would like to fit mlxp to.
new_table_name : str
The name of the new layer you would like to append to the dataset.
inplace : bool
If True, then this function
appends a dataArray to ds which is indexed with the same coordinate dimensions as
'data_table'. If False, a copy of ds is returned with the appended dataArray
Returns
-------
xarray.DataSet :
If inplace is False, return a new DataSet object which has
the enrichment values appended
Example
-------
>>> ds["counts"].values
array([[533, 734, 399, 588],
[563, 947, 814, 156],
[ 47, 705, 750, 685],
[675, 118, 897, 290],
[405, 880, 772, 570],
[629, 961, 530, 63],
[633, 931, 268, 115],
[833, 290, 164, 184],
[ 18, 704, 359, 33],
[143, 486, 371, 415]])
>>> rank_data(ds, data_table="counts", per_sample=True)
>>> ds["rank"].values
array([[4, 5, 4, 8],
[5, 8, 8, 3],
[1, 4, 6, 9],
[8, 0, 9, 5],
[3, 6, 7, 7],
[6, 9, 5, 1],
[7, 7, 1, 2],
[9, 1, 0, 4],
[0, 3, 2, 0],
[2, 2, 3, 6]])
"""
if data_table not in ds:
avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
raise KeyError(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)
counts = ds[data_table].to_pandas().values
cpm = _comp_rank_per_sample(counts) if per_sample else _comp_rank(counts)
cpm_da = xr.DataArray(cpm, dims=ds[data_table].dims)
if inplace:
ds[new_table_name] = cpm_da
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = cpm_da
return ds_copy
def _comp_rank(enrichment):
ret = np.ones(enrichment.shape)
sample_data_sh = enrichment.shape
sample_data = enrichment.flatten()
temp = sample_data.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(sample_data))
ret[:, :] = ranks.reshape(sample_data_sh)
return ret.astype(int)
def _comp_rank_per_sample(enrichment):
ret = np.ones(enrichment.shape)
for sid in range(enrichment.shape[1]):
sample_data = enrichment[:, sid]
temp = sample_data.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(sample_data))
ret[:, sid] = ranks.flatten()
return ret.astype(int)