Source code for phippery.eigen

r"""
=================
Eigen
=================

A method for MDS via Singular Value Decomposition 
on phip-seq datasets.
"""

import numpy as np
from numpy.linalg import svd
import xarray as xr
import pandas as pd
import scipy.stats as st
import itertools
import copy

[docs]def eigenassay_projections( ds, data_table="counts", compute_correlations=False, return_raw_decomposition=False, n_eigenvectors=None, ): r"""Compute the Singular Value Decomposition of the enrichment data before projecting each sample into the first n eigenvectors ("eigenassays") in the dataset. Concretely, given a Matrix of, :math:`X` enrichments in the `phippery` dataset with shape (peptides, samples). We compute the decomposition :math:`X = USV^{T}` The principal axes in feature space are then represented by the columns of :math:`V` and represent the direction of maximum variance in the data. The sample projections into this space are then computed and tied to the sample annotation in the returned dictionary. Parameters ---------- ds : xarray.DataSet The dataset you would like to perform eigen-decomposition on. data_table : str The name of the enrichment layer in the phippery dataset to perform the operation on. compute_correlations : bool If true, compute the correlation of a sample's true binding enrichments to the n'th principal axes. These will be added in the same fashion as the sample scores that are appended to the sample table. return_raw_decomposition : bool If true, include the raw :math:`USV^{T}` decomposition of the enrichment matrix specified n_eigenvectors : int the number of projections "eigenassay dimensions" to include. Returns ------- dict : 1. The eigenassay projects tied with the appended to sample annotations included in `ds`. 2. (optional) The raw "economy" SVD decomposition matrices. """ if data_table not in ds: avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"]) raise KeyError( f"{data_table} is not included in dataset. \n available datasets: {avail}" ) n_eigenvectors = len(ds.sample_id) if n_eigenvectors is None else n_eigenvectors if n_eigenvectors > len(ds.sample_id): raise ValueError( f"n_eigenvectors must be less than or equal to the number of samples in the dataset" ) a = ds[f"{data_table}"].values U, S, V = svd(a, full_matrices=False) sam_meta = ds.sample_table.to_pandas() z_jk = np.zeros([n_eigenvectors, n_eigenvectors]) if compute_correlations: p_jk = np.zeros([n_eigenvectors, n_eigenvectors]) for j in range(n_eigenvectors): for k in range(n_eigenvectors): z_jk[j, k] = np.dot(a[:, j], U[:, k]) if compute_correlations: p_jk[j, k] = st.pearsonr(a[:, j], U[:, k])[0] for k in range(n_eigenvectors): sam_meta[f"Eigenassay-{k}-projection"] = z_jk[:, k] if compute_correlations: sam_meta[f"Eigenassay-{k}-correlation"] = p_jk[:, k] ret = {"sample_eigenassay_projections": sam_meta} if return_raw_decomposition: ret["raw_decomposition"] = (U, S, V) return ret