The fundamental inference task is to infer \(p(z_i \mid x_i)\), where \(x_i\) is a \(p\)-dimensional observation, \(z_i\) is a \(k\)-dimensional latent variable, and \(k \ll n\).
Why do we want to do this?
- determine how much variation in the data is explained by known technical factors
- decide whether, and how to remove that variation before trying to explain the data using biological covariates
Importantly, these analyses are not directly usable for confounder correction for QTL mapping. Instead, we first need to learn the underlying distributions of the data and then perform dimensionality reduction on those parameters. However, it will be important to consider what data went into learning those distributions, and how to incorporate known and inferred confounders into that estimation procedure.
Here, we perform the following analyses:
- We perform PCA on the post-QC data and show that most variation is explained by gene detection rate
- We confirm in the real data that the entire distribution of non-zero gene expression is correlated with gene detection rate
- We show that regressing out the percentiles of gene expression eliminates the dependence on gene detection rate
(org-babel-lob-ingest "/home/aksarkar/.emacs.d/org-templates/library.org")
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import colorcet
import functools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import sklearn.decomposition as skd
import sklearn.linear_model as sklm
Read the full data matrix and apply the QC filters.
umi = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0)
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None)
umi = umi.loc[keep_genes.values.ravel(),keep_samples.values.ravel()]
annotations = annotations.loc[keep_samples.values.ravel()]
umi.shape
Use PPCA (Tipping et al 1999) to incorporate gene-specific mean
expression. Use the edgeR
pseudocount.
libsize = annotations['mol_hs'].values
pseudocount = .5 * libsize / libsize.mean()
log_cpm = (np.log(umi + pseudocount) - np.log(libsize + 2 * pseudocount) + 6 * np.log(10)) / np.log(2)
ppca = skd.PCA(n_components=10)
loadings = ppca.fit_transform(log_cpm.values.T)
plt.clf()
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(8, 8)
N = len(set(annotations['batch']))
for i in range(2):
for j in range(i, 2):
for k, batch in enumerate(sorted(set(annotations['batch']))):
ax[i][j].scatter(loadings[annotations['batch'] == batch,i], loadings[annotations['batch'] == batch,j + 1], c=colorcet.cm['rainbow'](k / N), s=4, marker='+', alpha=0.5)
ax[i][j].set_xlabel('PC{}'.format(j + 2))
ax[i][j].set_ylabel('PC{}'.format(i + 1))
ax[1, 0].set_axis_off()
fig.tight_layout()
plt.clf()
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(8, 8)
N = len(set(annotations['batch']))
for i in range(2):
for j in range(i, 2):
for k, batch in enumerate(sorted(set(annotations['batch']))):
ax[i][j].scatter(loadings[annotations['batch'] == batch,i], loadings[annotations['batch'] == batch,j + 1], c=colorcet.cm['rainbow'](k / N), s=4, marker='+', alpha=0.5)
ax[i][j].set_xlabel('PC{}'.format(j + 2))
ax[i][j].set_ylabel('PC{}'.format(i + 1))
ax[1, 0].set_axis_off()
fig.tight_layout()
plt.savefig('pca.pdf')
Correlate PCs with known continuous covariates by computing squared Pearson correlation.
Correlating PCs with individual (or other discrete covariates) is non-obvious because it is a categorical variable, and simply recoding it as integer is sensitive to ordering. Instead, regress the loading of each cell on each principal component \(lij\) against indicator variables for each individual \(Xik\).
\[ lij = ∑_j Xik βjk + μ + ε \]
From the regression fit, we can compute the coefficient of determination \(R^2\) for each PC \(j\):
\[ 1 - \frac{l_j - X \hat{β}_j}{l_j - \bar{l_j}} \]
def extract_covars(annotations):
return pd.Series({
'Reads': annotations['umi'],
'Molecules': annotations['molecules'],
'Mapped %': annotations['umi'] / annotations['mapped'],
'Genes detected': annotations['detect_hs'],
})
def correlation(pcs, cont_covars):
"""Return squared correlation between principal components and covariates
pcs - DataFrame (n x k)
cont_covars - DataFrame (n x q)
"""
result = []
for i in pcs:
for j in cont_covars:
keep = np.isfinite(cont_covars[j].values)
result.append([i, j, np.square(st.pearsonr(pcs[i][keep], cont_covars[j][keep]))[0]])
return pd.DataFrame(result, columns=['pc', 'covar', 'corr'])
def categorical_r2(loadings, annotations, key, name):
categories = sorted(annotations[key].unique())
onehot = np.zeros((annotations.shape[0], len(categories)), dtype=np.float32)
onehot[np.arange(onehot.shape[0]), annotations[key].apply(lambda x: categories.index(x))] = 1
m = sklm.LinearRegression(fit_intercept=True, copy_X=True).fit(onehot, loadings)
return pd.DataFrame({
'pc': np.arange(10),
'covar': name,
'corr': 1 - np.square(loadings - m.predict(onehot)).sum(axis=0) / np.square(loadings - loadings.mean(axis=0)).sum(axis=0)})
cont_covars = annotations.apply(extract_covars, axis=1)
cat_covars = list(zip(annotations[['batch', 'experiment', 'chip_id', 'well']],
['Batch', 'C1 chip', 'Individual', 'Well']))
corr = pd.concat(
[correlation(pd.DataFrame(loadings), cont_covars)] +
[categorical_r2(loadings, annotations, k, n) for k, n in cat_covars])
corr = corr.pivot(index='covar', columns='pc')
def plot_pca_covar_corr(pca, corr):
plt.clf()
fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [.25, .75]}, sharex=True)
fig.set_size_inches(4, 5)
ax[0].bar(np.arange(len(pca.components_)), pca.explained_variance_ratio_)
ax[0].set_xticks(np.arange(len(pca.components_)))
ax[0].set_xticklabels([str(x) for x in np.arange(1, len(pca.components_) + 1)])
ax[0].set_xlabel('Principal component')
ax[0].set_ylabel('PVE')
im = ax[1].imshow(corr.values, cmap=colorcet.cm['fire'], vmin=0, vmax=1, aspect='auto')
cb = plt.colorbar(im, ax=ax[1], orientation='horizontal')
cb.set_label('Squared correlation')
ax[1].set_xlabel('Principal component')
ax[1].set_yticks(np.arange(corr.shape[0]))
ax[1].set_yticklabels(corr.index)
ax[1].set_ylabel('Covariate')
plt.gcf().tight_layout()
%config InlineBackend.figure_formats = set(['svg'])
plot_pca_covar_corr(ppca, corr)
The top 10 PCs define a low-rank approximation to the original data, so we should ask how good the approximation was, by comparing the distribution of the original data to the distribution of the reconstructed data.
reconstructed = ppca.inverse_transform(loadings)
def plot_reconstruction(obs, approx):
plt.clf()
plt.hist(obs, bins=50, density=True, histtype='step', color='k', label='Observed')
plt.hist(approx, bins=50, density=True, histtype='step', color='r', label='Reconstructed')
plt.legend()
plt.xlabel('$\log_2(\mathrm{CPM} + 1)$')
plt.ylabel('Empirical density')
For genes with high proportion of zero counts, the low-rank approximation is mainly capturing the mean of the data, which is maybe more indicative of the zero proportion in the data rather than the actual mean of the data.
num_zero = np.isclose(umi, 0).sum(axis=1)
max_zero = num_zero.argmax()
plot_reconstruction(log_cpm.iloc[max_zero], reconstructed[:,max_zero])
This is true even for genes with the lowest proportion of zero counts.
min_zero = num_zero.argmin()
plot_reconstruction(log_cpm.iloc[min_zero], reconstructed[:,min_zero])
This might still be OK, if the reconstructed gene expression values are predictive of the original gene expression values.
pred_score = [sklm.LinearRegression(fit_intercept=True).fit(x.values.reshape(-1, 1), y).score(x.values.reshape(-1, 1), y)
for (_, x), (_, y)
in zip(log_cpm.iteritems(),
pd.DataFrame(reconstructed.T).iteritems())]
plt.clf()
plt.hist(pred_score, bins=50)
plt.xlabel('Prediction $R^2$')
plt.ylabel('Number of genes')
plt.title('Correlation between PCA and original data')
The distribution of squared correlations suggest that the low rank approximation is better for some genes than others, i.e. that there could be gene-specific or gene module-specific effects. These are unlikely to be captured by PCA or factor analysis.
Normalize the data.
Compute cosine distance between cells
import sklearn.metrics
distance = sklearn.metrics.pairwise.cosine_distances(log_cpm.values.T)
Compute the embedding
import sklearn.manifold
embeddings = {
perplexity: sklearn.manifold.TSNE(metric='precomputed', perplexity=perplexity).fit_transform(distance)
for perplexity in np.geomspace(5, 50, 5)}
plt.clf()
fig, ax = plt.subplots(1, 5)
fig.set_size_inches(10, 3)
N = len(set(annotations['chip_id']))
for i, k in enumerate(np.geomspace(5, 50, 5)):
for j, individual in enumerate(sorted(set(annotations['chip_id']))):
v = embeddings[k][annotations['chip_id'] == individual]
ax[i].scatter(v[:, 0], v[:, 1], c=colorcet.cm['rainbow'](j / N), s=2)
ax[i].set_title('Perplexity = {:.2g}'.format(k))
fig.tight_layout()
plt.clf()
fig, ax = plt.subplots(1, 5)
fig.set_size_inches(10, 3)
N = len(set(annotations['batch']))
for i, k in enumerate(np.geomspace(5, 50, 5)):
for j, batch in enumerate(sorted(set(annotations['batch']))):
v = embeddings[k][annotations['batch'] == batch]
ax[i].scatter(v[:, 0], v[:, 1], c=colorcet.cm['rainbow'](j / N), s=2)
fig.tight_layout()
If dropout changes the distribution of the non-zero observations, we might hope that fitting a latent variable model which explicitly includes dropout might eliminate that effect. Intuitively, we should downweight the evidence of observed zeroes on the inferred distribution which generated the observations.
However, fitting ZIFA (Pierson et al 2015) on the data again recovers a factor which is detection rate.
<<dim-reduction-imports>>
import ZIFA.block_ZIFA as zifa
<<read-data-qc>>
<<normalize>>
latent, params = zifa.fitModel(Y=np.log(umi.values.T + 1), K=10, p0_thresh=.7)
np.save('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/zifa-loadings.npy', latent)
Run on 28 cores to complete in a reasonable amount of time.
sbatch --partition=broadwl --time=400 --mem=32G -n1 -c28 --exclusive --out=zifa.out --err zifa.err
#!/bin/bash
source activate scqtl
python zifa.py
Submitted batch job 44152013
sacct -j 44152013 -o Elapsed,MaxRSS,MaxVMSize
Elapsed MaxRSS MaxVMSize ---------- ---------- ---------- 02:54:50 02:54:50 23391824K 27240528K
latent = np.load('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/zifa-loadings.npy')
%config InlineBackend.figure_formats = set(['svg'])
corr_zifa = pd.concat(
[correlation(pd.DataFrame(latent), cont_covars)] +
[categorical_r2(latent, annotations, k) for k in cat_covars])
corr_zifa = corr_zifa.pivot(index='covar', columns='pc')
plt.clf()
plt.gcf().set_size_inches(8, 12)
im = plt.imshow(corr_zifa.values, cmap=colorcet.cm['fire'], vmin=0, vmax=1, aspect='auto')
cb = plt.colorbar(im, orientation='horizontal')
cb.set_label('Squared correlation')
plt.xlabel('Learned factor')
_ = plt.xticks(np.arange(latent.shape[1]), np.arange(1, latent.shape[1] + 1))
plt.ylabel('Covariate')
_ = plt.yticks(np.arange(corr_zifa.shape[0]), corr_zifa.index)
ZIFA does not constrain factors to be orthogonal, so we would not expect it to get the same result as PPCA. However, the latent factor inferred by ZIFA correlated with sequencing depth still is highly correlated with PC1 inferred by PPCA, suggesting that it is not immune to whatever is biasing PPCA.
plt.clf()
plt.gcf().set_size_inches(12, 12)
im = plt.imshow(np.tril(np.corrcoef(latent.T, loadings.T)), cmap=colorcet.cm['coolwarm'], vmin=-1, vmax=1)
cb = plt.colorbar(im)
cb.set_label('Correlation')
labels = ['{} {}'.format(v, i) for v in ('Factor', 'PC') for i in range(10)]
_ = plt.xticks(np.arange(20), labels, rotation=90)
_ = plt.yticks(np.arange(20), labels)
Should we expect that the top principal component of log CPM is still sequencing depth? Hicks et al 2017 claim that correlation of the first principal component with detection rate can be explained by two facts:
- Centering the values of \(X\) does not center the values of \(log X\) (and vice versa)
- \(E[log X]\) depends on the gene detection rate
To see whether these facts can explain the correlation between PC1 and sequencing metrics we see in our data, we perform the following simulation:
- Generate un-normalized relative expression values for two groups of
samples, where one group has a true fold change in mean expression
\(β\):
\[ a(1)_j ∼ \mathrm{LogNormal}(0, 1) \]
\[ a(2)_0 = β a(1)_0 \]
\[ a(2)_j = a(1)_j,\ j ≠ 0 \]
- Compute the relative expression for each group \(p(k)_j = a(k)_j / ∑_j a(k)_j\)
- Sample a number of molecules \(R_i\) from the observed distribution of molecules
- Sample molecule counts \(xij ∼ \mathrm{Multinomial}(R_i, p)\)
- Normalize to log CPM using the
edgeR
definition - Fit probabilistic PCA (Tipping et al 1999) to explicitly account for mean
differences in the normalized data:
\[ p(x_i \mid z_i) ∼ N(W z_i + μ, σ^2 I) \]
\[ p(z_i) ∼ N(0, I) \]
where:
- \(x_i\) is a \(p × 1\) observation
- \(z_i\) is the associated \(k × 1\) latent variable, \(k \ll p\)
- \(W\) is the \(p × k\) matrix of loadings
- \(μ\) is the \(p × 1\) vector of gene-means
- Compute the squared correlation of loadings on each PC to \(R\)
class Simulation:
def __init__(self, num_genes, fold_change=2, seed=0):
np.random.seed(seed)
self.num_genes = num_genes
rel_expr_1 = np.flip(np.sort(np.random.lognormal(size=self.num_genes)), axis=-1)
rel_expr_2 = rel_expr_1.copy()
rel_expr_2[0] *= fold_change
self.rel_expr = [rel_expr_1, rel_expr_2]
for r in self.rel_expr:
r /= r.sum()
def generate_log_cpm(self, num_samples, library_sizes, detection_rates=None):
libsize = [np.random.choice(library_sizes, num_samples // 2) for _ in range(2)]
if detection_rates is not None:
det_rate = [np.random.choice(detection_rates, num_samples)]
counts = np.array([np.random.multinomial(n, p) for sizes, p in zip(libsize, self.rel_expr) for n in sizes]).reshape(num_samples, self.num_genes)
if detection_rates is not None:
mask = np.array([np.random.uniform() < d for d in det_rate for _ in self.rel_expr[0]]).reshape(num_samples, self.num_genes)
counts *= mask
total_counts = counts.sum(axis=1)
pseudocount = .5 * total_counts / total_counts.mean()
log_cpm = np.log(counts + pseudocount.reshape(-1, 1)) - np.log(total_counts + 2 * pseudocount).reshape(-1, 1) + 6 * np.log(10)
return log_cpm
def pca_corr(num_components=10, num_trials=10):
corr = []
for _ in range(num_trials):
ppca = skd.PCA(n_components=num_components)
loadings = ppca.fit_transform(log_cpm)
corr.append([st.pearsonr(x.ravel(), total_counts.ravel())[0] for x in loadings.T])
return corr
Look at a draw of the relative expression values:
sim = Simulation(num_genes=1000)
plt.clf()
plt.hist(sim.rel_expr[0], density=True, bins=50)
plt.xlabel('Relative expression')
plt.ylabel('Density')
Look at a draw of log CPM:
quantiles = np.linspace(0, 1, 9)[:-1]
simulated_log_cpm = sim.generate_log_cpm(num_samples=50, library_sizes=annotations['molecules'])
simulated_log_cpm = simulated_log_cpm[:,(quantiles * sim.num_genes).astype(int)]
jitter = np.random.normal(scale=.01, size=(50, 1)) + quantiles.reshape(1, -1)
plt.clf()
plt.scatter(jitter.ravel(), simulated_log_cpm.ravel())
plt.xlabel('Rank of relative gene expression')
plt.ylabel('log CPM')
Although the first principal component has non-zero correlation with library size, our simulation does not produce correlations comparable to the correlation we see in the actual data. This result suggests that explanation (1), not centering the data, does not fully explain the correlation.
corr = simulate_pca_log_cpm(500, 1000, annotations['molecules'], fold_change=2, num_trials=20, seed=1)
plt.clf()
plt.boxplot(np.square(corr))
plt.xlabel('Principal component')
plt.ylabel('Squared correlation with library size')
In the simulation, the correlation does not appear to depend on the true fold change (i.e., proportion of variance explained by biological factors). This result could be explained if the proportion of variance explained by the true fold change were small compared to the proportion of variance explained by library size.
corr_vs_fold_change = [
simulate_pca_log_cpm(500, 1000, annotations['molecules'], fold_change=fold_change, num_components=1, num_trials=20, seed=0)
for fold_change in np.linspace(1.1, 2, 10)]
plt.clf()
plt.boxplot(np.square(np.array(corr_vs_fold_change).mean(axis=-1)).T, positions=np.arange(10))
plt.xticks(np.arange(10), ['{:.3f}'.format(x) for x in np.linspace(1.1, 2, 10)])
plt.xlabel('True fold change')
plt.ylabel('Squared correlation with PC1')
If (not) centering the data does not explain the correlation, does dropout explain the correlation? We simulated dropout after drawing molecule counts \(xij\) as follows:
- Draw gene detection rates \(q_i\) from the observed gene detection rates
- Draw \(hij ∼ Bernoulli(q_i)\) iid.
- Use \(X^* = X ˆ H\) as the observed count matrix
Surprisingly, the correlation with the first PC goes away, likely because this dropout model is not correct.
corr_with_dropout = simulate_pca_log_cpm(500, 10000, annotations['molecules'], detection_rates=annotations['detect_hs'] / 2e4, fold_change=2, num_trials=20, seed=1)
plt.clf()
_ = plt.boxplot(np.square(corr_with_dropout))
plt.xlabel('Principal component')
plt.ylabel('Squared correlation with library size')
Hicks et al also claim that the entire distribution of non-zero measurements depends on detection rate. They show this by plotting the percentiles of non-zero expressed genes in each cell versus detection rate in that cell.
def plot_quantiles_vs_detection(umi, annotations, quantiles=None, pseudocount=None):
if quantiles is None:
quantiles = np.linspace(0, 1, 5)
else:
assert (quantiles >= 0).all()
assert (quantiles <= 1).all()
vals = np.nanpercentile(np.ma.masked_equal(umi.values, 0).astype(float).filled(np.nan), 100 * quantiles, axis=0, interpolation='higher')
if pseudocount is None:
# log CPM with per-cell pseudocount
total_counts = umi.sum()
pseudocount = .5 * total_counts / total_counts.mean()
label = 'log CPM'
vals = np.log(vals + pseudocount.values.reshape(1, -1)) - np.log(total_counts + 2 * pseudocount).values.reshape(1, -1) + 6 * np.log(10)
else:
vals = np.log(vals + pseudocount)
label = '$\log({} + {:.3g})$'.format('\mathrm{UMI}', pseudocount)
plt.clf()
plt.gcf().set_size_inches(4, 4)
for q, v in zip(quantiles, vals):
plt.scatter(annotations['detect_hs'] / keep_genes.shape[0], v, c=colorcet.cm['inferno'](q), label='{:.2f}'.format(.9 * q), s=2)
plt.legend(title='Quantile', frameon=False, fancybox=False,
bbox_to_anchor=(.5, -.35), loc='lower center', markerscale=4, ncol=5,
columnspacing=1, handletextpad=0)
plt.xlabel('Proportion of genes detected')
plt.ylabel(label)
We recapitulate the main result of Hicks et al in our data.
%config InlineBackend.figure_formats = set(['retina'])
plot_quantiles_vs_detection(umi, annotations)
log CPM as defined in edgeR
uses a pseudocount which depends on library
size, but the derivation in Hicks et al is for \(log(X + ε)\) where
\(ε\) is constant across cells.
Using constant \(ε\) changes the shape of the relationship between quantiles of non-zero expression and detection rate, but does not remove the relationship.
plot_quantiles_vs_detection(umi, annotations, pseudocount=1)
plot_quantiles_vs_detection(umi, annotations, pseudocount=1e-3)
Bi-center the data, by fitting a model where observations depend on a row-mean and a column-mean and then subtracting the means from each entry.
\[ xij ∼ N(u_i + v_j, σ^2) \]
def sample_feature_means(obs, max_iters=10):
"""Fit per-feature and per-sample means
x_ij ~ N(u_i + v_j, sigma^2)
Inputs:
x - ndarray (num_samples, num_features)
Returns:
sample_means - ndarray (num_samples, 1)
feature_means - ndarray (num_features, 1)
"""
n, p = obs.shape
sample_means = np.zeros((n, 1))
feature_means = np.zeros((1, p))
resid = obs.var()
llik = -.5 * np.sum(np.log(2 * np.pi * resid) + (obs - feature_means - sample_means) / resid)
# Coordinate ascent on llik
for _ in range(max_iters):
sample_means = np.mean(obs - feature_means, axis=1, keepdims=True)
feature_means = np.mean(obs - sample_means, axis=0, keepdims=True)
# By default, np divides by N, not N - 1
resid = np.var(obs - feature_means - sample_means)
update = -.5 * np.sum(np.log(2 * np.pi * resid) + (obs - feature_means - sample_means) / resid)
print(update)
if np.isclose(update, llik):
return sample_means, feature_means.T
else:
llik = update
raise ValueError("Failed to converge")
def plot_bicentered_pca(log_cpm, annotations, cont_covars, cat_covars):
sample_means, feature_means = sample_feature_means(log_cpm.values.T)
ppca = skd.PCA(n_components=10)
loadings = ppca.fit_transform(log_cpm.values.T - sample_means - feature_means.T)
corr = pd.concat(
[correlation(pd.DataFrame(loadings), cont_covars)] +
[categorical_r2(loadings, annotations, k, n) for k, n in cat_covars])
corr = corr.pivot(index='covar', columns='pc')
plot_pca_covar_corr(ppca, corr)
%config InlineBackend.figure_formats = set(['svg'])
plot_bicentered_pca(log_cpm, annotations, cont_covars, cat_covars)
The dependency of non-zero gene expression on gene detection rate is non-linear, so use kernel PCA to perform non-linear dimensionality reduction (Schölkopf et al 1998). The basic idea is to non-linearly map the original points into a different space, and perform PCA in that space.
The method eliminates the second technical PC by accurately modeling the non-linearity in the data, but it fails to eliminate the first technical PC because it does not include sample-specific mean parameters.
It is non-trivial to add such parameters because we have to center the projections of the samples, and the key algorithmic trick used is that we never have to actually compute the projections. In particular, we assume the radial basis function kernel, which projects the data into infinite dimensional space, making it impossible to compute the projections.
kpca = skd.KernelPCA(kernel='rbf', n_components=10)
loadings_kpca = kpca.fit_transform(log_cpm.values.T)
corr_kpca = pd.concat(
[correlation(pd.DataFrame(loadings_kpca), cont_covars)] +
[categorical_r2(loadings_kpca, annotations, k, n) for k, n in cat_covars])
corr_kpca = corr_kpca.pivot(index='covar', columns='pc')
%config InlineBackend.figure_formats = set(['svg'])
plt.clf()
plt.gcf().set_size_inches(8, 12)
im = plt.imshow(corr_kpca.values, cmap=colorcet.cm['fire'], vmin=0, vmax=1, aspect='auto')
cb = plt.colorbar(im, orientation='horizontal')
cb.set_label('Squared correlation')
plt.gca().set_xlabel('Principal component')
plt.gca().set_yticks(np.arange(corr_kpca.shape[0]))
plt.gca().set_yticklabels(corr_kpca.index)
plt.gca().set_ylabel('Covariate')
plt.gcf().tight_layout()
Although the dependency of the percentiles of non-zero gene expression on detection rate appears to be nonlinear, we can partially correct for it by regressing out the percentiles of expression from the expression values for each gene.
\[ yij = p_i β + μ_j + εij \]
\[ ˜{y}ij = yij - p_i \hat\beta - \hat\mu_j \]
normalized_percentiles = np.percentile(log_cpm, 100 * np.linspace(0, 1, 5), axis=0)
log_cpm_residuals = log_cpm.apply(lambda y: y - sklm.LinearRegression(fit_intercept=True).fit(normalized_percentiles.T, y).predict(normalized_percentiles.T), axis=1)
%config InlineBackend.figure_formats = set(['svg'])
plot_bicentered_pca(log_cpm_residuals, annotations, cont_covars, cat_covars)
Regression against the percentiles of gene expression seems like an inelegant way of performing quantile normalization. However, quantile normalizing doesn’t work.
import rpy2.robjects
import rpy2.robjects.numpy2ri
numpy2ri = rpy2.robjects.numpy2ri.numpy2ri
def qqnorm(x):
"""Wrap around R qqnorm"""
return np.asarray(rpy2.robjects.r['qqnorm'](numpy2ri(x))[0])
normed = log_cpm.apply(qqnorm, axis=0)
%config InlineBackend.figure_formats = set(['svg'])
plot_bicentered_pca(normed, annotations, cont_covars, cat_covars)