Here, we investigate the correlation between single cell RNA-Seq and bulk RNA-Seq on samples from the same cell type in the same individuals. Our goal is to qualitatively assess our ability to call QTLs from scRNA-Seq.
(org-babel-lob-ingest "/home/aksarkar/.emacs.d/org-templates/library.org")
%config InlineBackend.figure_formats = set(['svg'])
import colorcet
import itertools as it
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.special as sp
import scipy.stats as st
import sqlite3
def plot_concordance(x, y, title, filename, xlabel=None, ylabel=None, lim=None, **kwargs):
"""Plot hexbin of concordance"""
merged = x.merge(y, left_index=True, right_index=True)
merged.columns = ['x', 'y']
if lim is None:
lim = [merged.min().min(), merged.max().max()]
plt.clf()
if 'gridsize' not in kwargs:
kwargs['gridsize'] = 40
hexes = plt.hexbin(merged['x'], merged['y'], cmap=colorcet.cm['blues'], extent=lim + lim, **kwargs)
ax = plt.gca()
if lim is None:
ax.set_xlim([merged['x'].min(), merged['x'].max()])
ax.set_ylim([merged['y'].min(), merged['y'].max()])
else:
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.set_aspect('equal')
cb = plt.colorbar()
cb.set_label('Number of genes')
plt.plot(lim, lim, color='red')
plt.title(title)
if xlabel is None:
xlabel = 'scRNA-Seq $\log_2(\mathrm{CPM} + 1) $'
if ylabel is None:
ylabel = 'Bulk RNA-Seq $\log_2(\mathrm{TPM} + 1)$'
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.savefig(filename)
def cpm(counts, size=None, log2=False):
if size is None:
size = counts.sum(axis=0)
cpm = counts / size * 1e6
if log2:
cpm = np.log(cpm + 1) / np.log(2)
return cpm
def plot_concordance_by_individual(umi, annotations, bulk, output_dir):
bulk, pooled_cpm = bulk.align(
cpm(umi.groupby(by=annotations['chip_id'].values, axis=1).agg(np.sum),
size=annotations.groupby('chip_id')['mol_hs'].agg(np.sum), log2=True),
axis=1, join='inner')
for k in bulk:
plot_concordance(
x=pooled_cpm[k].to_frame(),
y=bulk[k].to_frame(),
title=k,
filename='{}/{}.svg'.format(output_dir, k))
def plot_concordance_by_num_cells(individual, umi, annotations, bulk_tpm, output_dir):
bulk_tpm = bulk_tpm[individual].to_frame()
umi = umi.loc[:,(annotations['chip_id'] == individual).values]
annotations = annotations[annotations['chip_id'] == individual]
for num_cells in [1, 10, 50, 100, 200]:
sample = np.random.choice(annotations.shape[0], size=num_cells)
pooled_cpm = cpm(umi.iloc[:,sample].sum(axis=1).to_frame(),
size=annotations.iloc[sample]['mol_hs'].agg(np.sum), log2=True)
plot_concordance(
x=pooled_cpm,
y=bulk_tpm,
title='{}, {} cell{}'.format(individual, num_cells, 's' if num_cells > 1 else ''),
filename='{}/{}-{}.svg'.format(output_dir, individual, num_cells),
gridsize=20)
def plot_concordance_pooled_subsets(individual, umi, annotations, output_dir):
umi = umi.loc[:,(annotations['chip_id'] == individual).values]
for num_cells in [1, 10, 50, 100]:
sample = umi.sample(n=2 * num_cells, axis=1)
pool1 = cpm(sample.iloc[:,:num_cells].sum(axis=1).to_frame(), log2=True)
pool2 = cpm(sample.iloc[:,num_cells:].sum(axis=1).to_frame(), log2=True)
plot_concordance(
x=pool1,
y=pool2,
title='{}, {} cell{}'.format(individual, num_cells, 's' if num_cells > 1 else ''),
filename='{}/{}-{}.svg'.format(output_dir, individual, num_cells),
ylabel='scRNA-Seq $\log_2(\mathrm{CPM} + 1)$',
gridsize=15)
def plot_concordance_rho(bulk, sc, output_dir, **kwargs):
if 'xlabel' not in kwargs:
kwargs['xlabel'] = 'Single cell ln relative abundance'
if 'ylabel' not in kwargs:
kwargs['ylabel'] = 'Bulk ln relative abundance'
bulk, sc = bulk.align(sc, axis=1, join='inner')
for k in bulk:
y = bulk[k].dropna().to_frame()
x = sc[k].dropna().to_frame()
plot_concordance(
x=x,
y=y,
title=k,
gridsize=20,
**kwargs,
filename='{}/{}.svg'.format(output_dir, k))
def mask(df):
return ~np.isfinite(df)
Read the QC files.
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None)
annotations = annotations.loc[keep_samples.values.ravel()]
Read the UMI matrix.
umi = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0)
umi = umi.loc[:,keep_samples.values.ravel()]
The only quantity which is directly comparable between bulk and scRNA-Seq is
relative abundance (Pachter 2011). Therefore, we re-processed the iPSC bulk
RNA-Seq using kallisto
.
Important: we need to quantify relative abundance with respect to exactly the same set of genes as the single cell data.
bulk_tpm = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/kallisto/bulk-ipsc-tpm.txt.gz', header=None, sep=' ').pivot(columns=0, index=1, values=2)
bulk_log_rho = np.log(bulk_tpm) - np.log(bulk_tpm.sum(axis=0))
Pool the single cells and estimate log CPM. We assume this is proportional to relative abundance (we assume UMIs really do directly count molecules).
sc_log_cpm = np.log(umi.groupby(annotations['chip_id'].values, axis=1).agg(np.sum)) - np.log(annotations.groupby('chip_id')['mol_hs'].agg(np.sum)) + 6 * np.log(10)
sc_log_rho = sc_log_cpm - sp.logsumexp(sc_log_cpm, axis=0)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()]
.mask(mask)
.align(sc_log_rho.loc[keep_genes.values.ravel()]
.mask(mask), join='inner'))
plot_concordance_rho(
S,
T,
'/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/pooled/')
Plot the individual with the most cells, and the fewest cells.
file:figure/sc-vs-bulk.org/pooled/NA18507.svg file:figure/sc-vs-bulk.org/pooled/NA18522.svg
Look at the distribution of absolute differences.
S, T = bulk_log_rho.align(sc_log_rho, join='inner')
diff = abs(S - T)
np.nanpercentile(diff.mask(~np.isfinite(diff)), [90, 95, 99, 99.5, 99.9])
Look at pathway enrichment for genes only detected in bulk.
bulk_only = set([x for k in T for x in pd.Series(T[k].mask(np.isfinite(S[k])).dropna().index)])
Annotation Cluster 1 Enrichment Score: 6.163250777011418 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR018064:Metallothionein, vertebrate, metal binding site 6 5.9405940594059405 5.214809864836233E-10 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 11 18559 128.14039125431532 8.343695168111509E-8 8.343695168111509E-8 6.296819954343391E-7 UP_SEQ_FEATURE region of interest:Alpha 6 5.9405940594059405 1.0452172113872762E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 13 20063 115.74807692307692 2.4144515486934637E-7 2.4144515486934637E-7 1.3409402699338102E-6 UP_SEQ_FEATURE region of interest:Beta 6 5.9405940594059405 1.0452172113872762E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 13 20063 115.74807692307692 2.4144515486934637E-7 2.4144515486934637E-7 1.3409402699338102E-6 UP_SEQ_FEATURE metal ion-binding site:Divalent metal cation; cluster A 6 5.9405940594059405 1.6208966909643656E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 14 20063 107.48035714285714 3.744270538064143E-7 1.8721354444473093E-7 2.0794964750159295E-6 UP_SEQ_FEATURE metal ion-binding site:Divalent metal cation; cluster B 6 5.9405940594059405 1.6208966909643656E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 14 20063 107.48035714285714 3.744270538064143E-7 1.8721354444473093E-7 2.0794964750159295E-6 INTERPRO IPR003019:Metallothionein superfamily, eukaryotic 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 INTERPRO IPR000006:Metallothionein, vertebrate 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 INTERPRO IPR023587:Metallothionein domain, vertebrate 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 INTERPRO IPR017854:Metallothionein domain 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 UP_KEYWORDS Metal-thiolate cluster 6 5.9405940594059405 3.848645320957292E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 97 14 20581 90.93225331369662 5.272642669140737E-7 5.272642669140737E-7 4.523761099051171E-6 GOTERM_BP_DIRECT GO:0045926~negative regulation of growth 6 5.9405940594059405 1.6056085205036586E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 75 19 16792 70.70315789473685 6.727477139589766E-6 6.727477139589766E-6 2.2546318878546856E-5 GOTERM_BP_DIRECT GO:0071294~cellular response to zinc ion 6 5.9405940594059405 1.6056085205036586E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 75 19 16792 70.70315789473685 6.727477139589766E-6 6.727477139589766E-6 2.2546318878546856E-5 UP_KEYWORDS Cadmium 5 4.9504950495049505 5.5012039998665416E-8 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 97 9 20581 117.87514318442153 7.536621291603929E-6 3.7683177458447403E-6 6.466204057753444E-5 KEGG_PATHWAY hsa04978:Mineral absorption 6 5.9405940594059405 6.197865698893864E-7 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 27 46 6910 33.38164251207729 3.1608625294388126E-5 3.1608625294388126E-5 5.995777879075348E-4 GOTERM_BP_DIRECT GO:0071276~cellular response to cadmium ion 5 4.9504950495049505 7.917968157631074E-7 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 75 17 16792 65.85098039215686 3.317079698986758E-4 1.6586774100302293E-4 0.0011118534141041359 UP_KEYWORDS Copper 5 4.9504950495049505 2.4203238770349687E-4 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 97 65 20581 16.321173671689134 0.03261860473945333 0.008256340267242868 0.2841189120846299 GOTERM_MF_DIRECT GO:0046872~metal ion binding 13 12.871287128712872 0.12273080737129344 ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000205358, ENSG00000187193, ENSG00000121691, ENSG00000198417 69 2069 16881 1.5372055393279678 0.9999998290722704 0.7894858696046019 77.70573642627673 GOTERM_MF_DIRECT GO:0008270~zinc ion binding 8 7.920792079207921 0.18975529795362245 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417 69 1169 16881 1.6742663740841297 0.9999999999866547 0.8758984183433024 91.03468733803763 GOTERM_CC_DIRECT GO:0048471~perinuclear region of cytoplasm 6 5.9405940594059405 0.19285594142887852 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 91 621 18224 1.934915326219674 0.9999999919695161 0.9759592316379472 90.12891148150037 UP_KEYWORDS Zinc 13 12.871287128712872 0.41448076703181924 ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417 97 2348 20581 1.1747352429793287 1.0 0.9829889947051305 99.81480020819157 UP_KEYWORDS Metal-binding 15 14.85148514851485 0.8237195669856142 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000121691, ENSG00000165188, ENSG00000247746 97 3640 20581 0.8743485895547751 1.0 0.9992576564560991 99.99999986203825 Annotation Cluster 2 Enrichment Score: 3.484819795455531 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR001152:Thymosin beta-4 4 3.9603960396039604 7.098533109747709E-7 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 79 5 18559 187.93924050632913 1.1357012050017268E-4 3.785814005408117E-5 8.571359595421768E-4 SMART SM00152:THY 4 3.9603960396039604 7.766657663469284E-7 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 45 5 10057 178.79111111111112 2.873623162158445E-5 2.873623162158445E-5 6.973781416896863E-4 PIR_SUPERFAMILY PIRSF001828:thymosin beta 4 3.9603960396039604 2.7080998708248916E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 13 5 1692 104.12307692307694 2.708066868917225E-5 2.708066868917225E-5 0.0016261217555713081 GOTERM_BP_DIRECT GO:0042989~sequestering of actin monomers 4 3.9603960396039604 9.642416279102543E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 75 10 16792 89.55733333333333 0.004032041304637524 0.0013458241984475316 0.013539249589888946 GOTERM_MF_DIRECT GO:0003785~actin monomer binding 4 3.9603960396039604 1.5210285357647118E-4 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 69 26 16881 37.63879598662207 0.017938766106036064 0.017938766106036064 0.1742005540888658 GOTERM_CC_DIRECT GO:0031941~filamentous actin 4 3.9603960396039604 4.7367184906870326E-4 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 91 31 18224 25.840482098546616 0.04038125481188837 0.04038125481188837 0.5107377212681841 GOTERM_BP_DIRECT GO:0007015~actin filament organization 4 3.9603960396039604 0.0039404588803159685 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 75 72 16792 12.43851851851852 0.8087767706624795 0.21048033854212067 5.393322740309592 UP_KEYWORDS Actin-binding 5 4.9504950495049505 0.0394339844994075 ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164 97 274 20581 3.871811272480999 0.9959614191508189 0.5449776981781902 37.680777033965654 GOTERM_BP_DIRECT GO:0030036~actin cytoskeleton organization 3 2.9702970297029703 0.11220693889642849 ENSG00000034510, ENSG00000205542, ENSG00000158164 75 130 16792 5.166769230769231 1.0 0.9373656872217728 81.19896543130287 UP_KEYWORDS Cytoskeleton 5 4.9504950495049505 0.7843259865566969 ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164 97 1138 20581 0.9322287246571123 1.0 0.9985945289079377 99.99999852310495 Annotation Cluster 3 Enrichment Score: 1.7811609652286242 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR SMART SM00389:HOX 8 7.920792079207921 9.417302516012567E-5 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 45 250 10057 7.151644444444445 0.003478501955001434 0.0017407661108270744 0.08452766486343188 INTERPRO IPR001356:Homeodomain 8 7.920792079207921 1.0044839918367324E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 79 256 18559 7.341376582278481 0.015944076888024683 0.0040100758644506795 0.12122279079956888 UP_SEQ_FEATURE DNA-binding region:Homeobox 7 6.9306930693069315 1.0745524652182817E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093 80 191 20063 9.191164921465969 0.02451792636356398 0.008240359123208085 0.13776994804843845 UP_KEYWORDS Homeobox 8 7.920792079207921 2.273035742769636E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 97 262 20581 6.478633823876604 0.030664146783657475 0.010327675965698613 0.26685006254048016 GOTERM_BP_DIRECT GO:0006366~transcription from RNA polymerase II promoter 10 9.900990099009901 4.090749986540058E-4 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598 75 513 16792 4.364392462638077 0.15754707376630528 0.028168569886196426 0.5729025901226814 INTERPRO IPR009057:Homeodomain-like 8 7.920792079207921 5.261429021979469E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 79 336 18559 5.59342977697408 0.08075722475289715 0.01669998667646322 0.6334635185275994 GOTERM_MF_DIRECT GO:0001077~transcriptional activator activity, RNA polymerase II core promoter proximal region sequence-specific binding 6 5.9405940594059405 0.002608319419483337 ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000164093 69 236 16881 6.219970523212969 0.2671362793302654 0.1439254000557343 2.949180047116129 UP_SEQ_FEATURE compositionally biased region:Poly-Ala 7 6.9306930693069315 0.00518825052735661 ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000171450, ENSG00000163508 80 404 20063 4.3453279702970296 0.6992890245264455 0.25947910072369085 6.455682440417276 UP_KEYWORDS DNA-binding 19 18.81188118811881 0.0061751724814120695 ENSG00000129514, ENSG00000184302, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2050 20581 1.9665023887352275 0.5719954819022799 0.15610255752991076 7.022162963087153 GOTERM_BP_DIRECT GO:0007420~brain development 5 4.9504950495049505 0.00987285598589564 ENSG00000152977, ENSG00000053438, ENSG00000138083, ENSG00000054598, ENSG00000163508 75 190 16792 5.891929824561403 0.9843499743231441 0.36992744676798917 13.00554505004673 GOTERM_BP_DIRECT GO:0045944~positive regulation of transcription from RNA polymerase II promoter 11 10.891089108910892 0.01060386709098585 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 75 981 16792 2.5105266734624534 0.9885153598169162 0.36024787175300976 13.90312024253032 UP_SEQ_FEATURE compositionally biased region:Poly-Gly 5 4.9504950495049505 0.028203027199206778 ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000163508 80 292 20063 4.294306506849315 0.9986511918732182 0.7333202527794147 30.72078907690975 GOTERM_MF_DIRECT GO:0000981~RNA polymerase II transcription factor activity, sequence-specific DNA binding 4 3.9603960396039604 0.03168704614824783 ENSG00000129514, ENSG00000184302, ENSG00000138083, ENSG00000054598 69 171 16881 5.722857869310959 0.9783292834625844 0.7212010778222342 30.862537573604364 UP_KEYWORDS Developmental protein 10 9.900990099009901 0.03297928048143198 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000164853, ENSG00000053438, ENSG00000138083, ENSG00000164093, ENSG00000163508 97 949 20581 2.2357772152998816 0.9898911492926898 0.5350028435318056 32.57673990320743 GOTERM_MF_DIRECT GO:0003677~DNA binding 13 12.871287128712872 0.03398271598877738 ENSG00000184302, ENSG00000129514, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508 69 1674 16881 1.8999272765051167 0.9836618555842146 0.6424797192945351 32.718143656703056 INTERPRO IPR017970:Homeobox, conserved site 4 3.9603960396039604 0.04596070918167841 ENSG00000109132, ENSG00000164853, ENSG00000170561, ENSG00000164093 79 190 18559 4.94576948700866 0.9994622235937994 0.7148323276503672 43.341501655162176 GOTERM_MF_DIRECT GO:0000977~RNA polymerase II regulatory region sequence-specific DNA binding 4 3.9603960396039604 0.05156656275239971 ENSG00000109132, ENSG00000205922, ENSG00000054598, ENSG00000163508 69 208 16881 4.704849498327759 0.9981642397951731 0.7163627995115522 45.49275378977236 GOTERM_MF_DIRECT GO:0000978~RNA polymerase II core promoter proximal region sequence-specific DNA binding 5 4.9504950495049505 0.054849155170871446 ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000164093 69 355 16881 3.445805266380894 0.9987848382314719 0.673332685025507 47.616359931448414 GOTERM_MF_DIRECT GO:0003700~transcription factor activity, sequence-specific DNA binding 7 6.9306930693069315 0.18959031696019613 ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 69 961 16881 1.782065782925395 0.9999999999863274 0.8971154894254747 91.01374122380184 UP_KEYWORDS Activator 6 5.9405940594059405 0.19590645201033402 ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000163508 97 661 20581 1.9259478765382037 0.9999999999998936 0.8995206058743108 92.29171287120641 GOTERM_BP_DIRECT GO:0045893~positive regulation of transcription, DNA-templated 4 3.9603960396039604 0.3971154565879258 ENSG00000152977, ENSG00000143869, ENSG00000054598, ENSG00000163508 75 515 16792 1.7389773462783173 1.0 0.9999587761988841 99.91796526374138 UP_KEYWORDS Transcription regulation 13 12.871287128712872 0.40500487412843467 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2332 20581 1.1827951760357907 1.0 0.9847649688241881 99.77633739079752 UP_KEYWORDS Transcription 13 12.871287128712872 0.44414461678326184 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2398 20581 1.1502411803650807 1.0 0.9820944180997002 99.89948309755069 UP_KEYWORDS Nucleus 25 24.752475247524753 0.5820328536133491 ENSG00000164853, ENSG00000253626, ENSG00000253506, ENSG00000159182, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000147481, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000129514, ENSG00000184302, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000170561, ENSG00000138083, ENSG00000250254, ENSG00000164093, ENSG00000182195, ENSG00000119547, ENSG00000153266, ENSG00000163508 97 5244 20581 1.0115143865940062 1.0 0.9956273086940396 99.99647757087135 GOTERM_BP_DIRECT GO:0030154~cell differentiation 3 2.9702970297029703 0.6080833739283924 ENSG00000152977, ENSG00000119547, ENSG00000205922 75 462 16792 1.453852813852814 1.0 0.9999995136278385 99.99980612003067 UP_KEYWORDS Disease mutation 9 8.91089108910891 0.9204249405761661 ENSG00000152977, ENSG00000184302, ENSG00000109132, ENSG00000124172, ENSG00000138083, ENSG00000168878, ENSG00000164093, ENSG00000054598, ENSG00000248099 97 2550 20581 0.7488538508186781 1.0 0.9999149050236531 99.99999999998799 Annotation Cluster 4 Enrichment Score: 1.165498308323919 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_BP_DIRECT GO:0006413~translational initiation 6 5.9405940594059405 3.446046051185818E-4 ENSG00000067048, ENSG00000198692, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000205609 75 137 16792 9.805547445255476 0.1344708403894822 0.03545959319361591 0.4828157254203713 GOTERM_CC_DIRECT GO:0022625~cytosolic large ribosomal subunit 3 2.9702970297029703 0.04452488666483783 ENSG00000198918, ENSG00000229117, ENSG00000163923 91 68 18224 8.835164835164836 0.9809854506297719 0.8621067464658695 38.874961529405915 UP_KEYWORDS Ribosomal protein 4 3.9603960396039604 0.055666091991101146 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 97 185 20581 4.58757314015046 0.9996089745535879 0.6250048700776605 48.99391263132687 GOTERM_MF_DIRECT GO:0003735~structural constituent of ribosome 4 3.9603960396039604 0.06034723008605642 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 69 222 16881 4.408147277712495 0.9993930954418766 0.6038222785369594 51.00471832879968 GOTERM_BP_DIRECT GO:0006614~SRP-dependent cotranslational protein targeting to membrane 3 2.9702970297029703 0.06460194328297483 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 94 16792 7.145531914893617 0.9999999999992961 0.8838033244845918 60.850545022072836 GOTERM_MF_DIRECT GO:0003723~RNA binding 6 5.9405940594059405 0.06925428571255729 ENSG00000067048, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923, ENSG00000178997 69 547 16881 2.6835704633971864 0.9998046135816214 0.6128518819827384 56.07177294869783 GOTERM_BP_DIRECT GO:0019083~viral transcription 3 2.9702970297029703 0.0874042522465415 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 112 16792 5.997142857142857 1.0 0.922295808229385 72.31651234196252 KEGG_PATHWAY hsa03010:Ribosome 3 2.9702970297029703 0.09197141519874578 ENSG00000129824, ENSG00000198918, ENSG00000229117 27 136 6910 5.645424836601307 0.9927041560195942 0.8060490790853818 60.67613480771449 GOTERM_BP_DIRECT GO:0000184~nuclear-transcribed mRNA catabolic process, nonsense-mediated decay 3 2.9702970297029703 0.09683692108179212 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 119 16792 5.64436974789916 1.0 0.9305572806891955 76.07463661943554 GOTERM_BP_DIRECT GO:0006412~translation 4 3.9603960396039604 0.10082618317693195 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 75 253 16792 3.539815546772069 1.0 0.927158013883527 77.51659768922529 UP_KEYWORDS Ribonucleoprotein 4 3.9603960396039604 0.16041445057926768 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 97 296 20581 2.867233212594037 0.9999999999604725 0.8641452261285151 87.1930135274217 GOTERM_BP_DIRECT GO:0006364~rRNA processing 3 2.9702970297029703 0.24311129519341956 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 214 16792 3.138691588785047 1.0 0.9978504982777531 97.9985714847756 GOTERM_CC_DIRECT GO:0005829~cytosol 7 6.9306930693069315 0.9995612834920512 ENSG00000226784, ENSG00000125148, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000154620, ENSG00000121691 91 3315 18224 0.42287968441814594 1.0 1.0 100.0 Annotation Cluster 5 Enrichment Score: 0.820211096675127 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_MF_DIRECT GO:0043565~sequence-specific DNA binding 6 5.9405940594059405 0.0575036128572627 ENSG00000129514, ENSG00000164853, ENSG00000170561, ENSG00000164093, ENSG00000054598, ENSG00000163508 69 518 16881 2.833808964243747 0.999130471945484 0.6346115628640061 49.27807833492119 GOTERM_MF_DIRECT GO:0003700~transcription factor activity, sequence-specific DNA binding 7 6.9306930693069315 0.18959031696019613 ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 69 961 16881 1.782065782925395 0.9999999999863274 0.8971154894254747 91.01374122380184 GOTERM_MF_DIRECT GO:0008134~transcription factor binding 3 2.9702970297029703 0.31758166829967777 ENSG00000129514, ENSG00000164093, ENSG00000054598 69 284 16881 2.5843539497856707 1.0 0.9697360578283208 98.74714487738314 Annotation Cluster 6 Enrichment Score: 0.508309881872388 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR002110:Ankyrin repeat 3 2.9702970297029703 0.29082448887491896 ENSG00000222038, ENSG00000163046, ENSG00000196834 79 255 18559 2.763812360387193 1.0 0.9989646799915811 98.42286358849252 SMART SM00248:ANK 3 2.9702970297029703 0.2975318758729139 ENSG00000222038, ENSG00000163046, ENSG00000196834 45 249 10057 2.692637215528782 0.9999978856103895 0.9871650256415595 95.80398991675298 INTERPRO IPR020683:Ankyrin repeat-containing domain 3 2.9702970297029703 0.30637168905164147 ENSG00000222038, ENSG00000163046, ENSG00000196834 79 265 18559 2.6595175543348457 1.0 0.9985017380546117 98.79322761229355 UP_KEYWORDS ANK repeat 3 2.9702970297029703 0.3494196849000245 ENSG00000222038, ENSG00000163046, ENSG00000196834 97 264 20581 2.411082474226804 1.0 0.9748006629928797 99.36098921844489 Annotation Cluster 7 Enrichment Score: 0.354336784895891 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_SEQ_FEATURE zinc finger region:C2H2-type 4 5 4.9504950495049505 0.20136843576824753 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 80 588 20063 2.132546768707483 1.0 0.9998260841655876 94.41306218498366 UP_SEQ_FEATURE zinc finger region:C2H2-type 3 5 4.9504950495049505 0.24137654936132147 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 80 636 20063 1.9715998427672958 1.0 0.9998901331440954 97.1105095588215 INTERPRO IPR013087:Zinc finger C2H2-type/integrase DNA-binding domain 5 4.9504950495049505 0.3509812706610015 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 79 712 18559 1.649747546579434 1.0 0.999008904105432 99.45921000508513 UP_SEQ_FEATURE zinc finger region:C2H2-type 5 4 3.9603960396039604 0.3687510325334959 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932 80 550 20063 1.823909090909091 1.0 0.9999925558095984 99.72665248783586 INTERPRO IPR015880:Zinc finger, C2H2-like 5 4.9504950495049505 0.3990477278010214 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 79 762 18559 1.5414963952290774 1.0 0.999393086210869 99.78644025284262 UP_KEYWORDS Transcription regulation 13 12.871287128712872 0.40500487412843467 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2332 20581 1.1827951760357907 1.0 0.9847649688241881 99.77633739079752 UP_KEYWORDS Zinc 13 12.871287128712872 0.41448076703181924 ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417 97 2348 20581 1.1747352429793287 1.0 0.9829889947051305 99.81480020819157 SMART SM00355:ZnF_C2H2 5 4.9504950495049505 0.4300934117849453 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 45 762 10057 1.4664625255176436 0.9999999990779891 0.994489588687009 99.35831314673327 INTERPRO IPR007087:Zinc finger, C2H2 5 4.9504950495049505 0.4343150143180009 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 79 799 18559 1.4701129576527623 1.0 0.9994976627786998 99.89711105742633 UP_SEQ_FEATURE zinc finger region:C2H2-type 2 4 3.9603960396039604 0.4378369016202355 ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 80 615 20063 1.631138211382114 1.0 0.9999983334439706 99.93821123573167 UP_KEYWORDS Transcription 13 12.871287128712872 0.44414461678326184 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2398 20581 1.1502411803650807 1.0 0.9820944180997002 99.89948309755069 UP_SEQ_FEATURE zinc finger region:C2H2-type 6 3 2.9702970297029703 0.590458495161822 ENSG00000153266, ENSG00000198105, ENSG00000177932 80 501 20063 1.5017215568862274 1.0 0.9999999655860128 99.99893817056615 UP_SEQ_FEATURE zinc finger region:C2H2-type 1 3 2.9702970297029703 0.6455180669053127 ENSG00000153266, ENSG00000177932, ENSG00000215397 80 554 20063 1.3580550541516245 1.0 0.999999962991393 99.99983342537078 UP_KEYWORDS Zinc-finger 7 6.9306930693069315 0.8479589820363391 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397, ENSG00000165188, ENSG00000247746 97 1781 20581 0.8339285817651383 1.0 0.9994944279023904 99.99999997575446 GOTERM_BP_DIRECT GO:0006351~transcription, DNA-templated 5 4.9504950495049505 0.9785388522558911 ENSG00000164853, ENSG00000198105, ENSG00000177932, ENSG00000215397, ENSG00000163508 75 1955 16792 0.572617220801364 1.0 1.0 100.0 Annotation Cluster 8 Enrichment Score: 0.06711873232279976 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_CC_DIRECT GO:0005615~extracellular space 9 8.91089108910891 0.3473712675240062 ENSG00000165246, ENSG00000222038, ENSG00000198918, ENSG00000096088, ENSG00000196834, ENSG00000270136, ENSG00000143869, ENSG00000121691, ENSG00000168878 91 1347 18224 1.3380650529871019 0.9999999999999999 0.995027606334708 99.00691145271703 UP_SEQ_FEATURE disulfide bond 7 6.9306930693069315 0.9799768322289583 ENSG00000165246, ENSG00000180974, ENSG00000096088, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099 80 2917 20063 0.6018212204319506 1.0 1.0 100.0 UP_KEYWORDS Disulfide bond 8 7.920792079207921 0.9979253382713639 ENSG00000165246, ENSG00000180974, ENSG00000096088, ENSG00000270136, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099 97 3434 20581 0.4942929708374112 1.0 0.999999993214293 100.0 UP_SEQ_FEATURE signal peptide 6 5.9405940594059405 0.9982749748270936 ENSG00000179542, ENSG00000165246, ENSG00000096088, ENSG00000143869, ENSG00000168878, ENSG00000248099 80 3346 20063 0.4497086072922893 1.0 1.0 100.0 UP_KEYWORDS Signal 9 8.91089108910891 0.9995774779340586 ENSG00000179542, ENSG00000214194, ENSG00000165246, ENSG00000096088, ENSG00000270136, ENSG00000143869, ENSG00000168878, ENSG00000243317, ENSG00000248099 97 4160 20581 0.459033009516257 1.0 0.9999999998538799 100.0 UP_SEQ_FEATURE glycosylation site:N-linked (GlcNAc...) 5 4.9504950495049505 0.9999874126486624 ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000143869, ENSG00000168878 80 4234 20063 0.2961590694378838 1.0 1.0 100.0 UP_KEYWORDS Glycoprotein 5 4.9504950495049505 0.9999998667567829 ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000143869, ENSG00000168878 97 4551 20581 0.23310839126780789 1.0 1.0 100.0 Annotation Cluster 9 Enrichment Score: 0.014705889178132312 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_MF_DIRECT GO:0005524~ATP binding 4 3.9603960396039604 0.9471714074328096 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142 69 1495 16881 0.6545877562890795 1.0 0.9999999988516262 99.99999999999977 UP_KEYWORDS ATP-binding 4 3.9603960396039604 0.9617258940215971 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142 97 1391 20581 0.6101373335210891 1.0 0.9999922196380046 100.0 UP_KEYWORDS Nucleotide-binding 4 3.9603960396039604 0.9917505356146884 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142 97 1788 20581 0.4746650061117646 1.0 0.9999997700488767 100.0 Annotation Cluster 10 Enrichment Score: 0.01422812959020817 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_SEQ_FEATURE transmembrane region 16 15.841584158415841 0.9235699422433608 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000180974, ENSG00000165246, ENSG00000165188, ENSG00000166002, ENSG00000205670 80 5056 20063 0.7936313291139241 1.0 0.9999999999999999 99.99999999999953 UP_KEYWORDS Transmembrane helix 20 19.801980198019802 0.9664300899010495 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 97 5634 20581 0.7531958030953453 1.0 0.9999910591386459 100.0 UP_KEYWORDS Transmembrane 20 19.801980198019802 0.9677227406188967 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 97 5651 20581 0.7509299512721953 1.0 0.9999895880346041 100.0 UP_SEQ_FEATURE topological domain:Extracellular 7 6.9306930693069315 0.9713553106975322 ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000086159, ENSG00000180638 80 2787 20063 0.6298932543954072 1.0 1.0 100.0 UP_SEQ_FEATURE topological domain:Cytoplasmic 9 8.91089108910891 0.9727885778326631 ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142 80 3456 20063 0.6530924479166668 1.0 1.0 100.0 GOTERM_CC_DIRECT GO:0016021~integral component of membrane 18 17.82178217821782 0.9855047988667748 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 91 5163 18224 0.6981885052774071 1.0 0.9999999999999534 100.0 UP_KEYWORDS Membrane 26 25.742574257425744 0.9884542831964291 ENSG00000171840, ENSG00000198133, ENSG00000170091, ENSG00000253626, ENSG00000180638, ENSG00000197142, ENSG00000228474, ENSG00000179542, ENSG00000180389, ENSG00000124172, ENSG00000110934, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000176533, ENSG00000127540, ENSG00000139370, ENSG00000205639, ENSG00000086159, ENSG00000178449, ENSG00000214194, ENSG00000165246, ENSG00000180974, ENSG00000270136, ENSG00000171450, ENSG00000205670 97 7494 20581 0.7361297973086373 1.0 0.9999995215886112 100.0
Look at genes only detected in single cell.
set(x for k in S for x in pd.Series(S[k].mask(np.isfinite(T[k])).dropna().index))
Look at pathway enrichments for genes detected only in single cell.
Annotation Cluster 1 Enrichment Score: 0.16470559032620852 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_BP_DIRECT GO:0045944~positive regulation of transcription from RNA polymerase II promoter 3 13.043478260869565 0.3277257959734531 ENSG00000111087, ENSG00000184302, ENSG00000120690 21 981 16792 2.4453181884374544 1.0 0.9999999999888005 98.99989403807203 UP_KEYWORDS DNA-binding 3 13.043478260869565 0.6331696404915614 ENSG00000111087, ENSG00000184302, ENSG00000120690 22 2050 20581 1.3690243902439023 1.0 0.9988514746581159 99.99771774457302 GOTERM_CC_DIRECT GO:0005654~nucleoplasm 3 13.043478260869565 0.852891320214584 ENSG00000111087, ENSG00000120690, ENSG00000127080 22 2784 18224 0.8926332288401253 1.0 0.9999966100403458 99.99999864269819 GOTERM_CC_DIRECT GO:0005634~nucleus 5 21.73913043478261 0.9101247789696545 ENSG00000111087, ENSG00000184302, ENSG00000109618, ENSG00000120690, ENSG00000127080 22 5415 18224 0.7648787039368757 1.0 0.9999990374355104 99.99999998711735 UP_KEYWORDS Nucleus 4 17.391304347826086 0.9320660841557977 ENSG00000111087, ENSG00000184302, ENSG00000120690, ENSG00000127080 22 5244 20581 0.7135774218154081 1.0 0.9999972758163257 99.99999999996426 Annotation Cluster 2 Enrichment Score: 0.13876944094888757 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_KEYWORDS Membrane 10 43.47826086956522 0.3432466422296966 ENSG00000137502, ENSG00000102743, ENSG00000050438, ENSG00000099617, ENSG00000128283, ENSG00000149541, ENSG00000162188, ENSG00000197818, ENSG00000185818, ENSG00000213380 22 7494 20581 1.2483319989324793 0.9999999999999983 0.9772688861660149 98.86760814991567 UP_SEQ_FEATURE transmembrane region 5 21.73913043478261 0.8140386056324314 ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5056 20063 0.9018537830840045 1.0 0.9999999999999997 99.99999861987743 UP_KEYWORDS Transmembrane helix 5 21.73913043478261 0.8670386049310165 ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5634 20581 0.8302271920482783 1.0 0.99999148614062 99.99999995415865 UP_KEYWORDS Transmembrane 5 21.73913043478261 0.8687936796993484 ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5651 20581 0.8277296053795788 1.0 0.9999827433821622 99.99999996021165 GOTERM_CC_DIRECT GO:0016021~integral component of membrane 4 17.391304347826086 0.9614936232995246 ENSG00000102743, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5163 18224 0.6417692321236773 1.0 0.9999999410477701 99.99999999999572 Annotation Cluster 3 Enrichment Score: 0.10814451882692515 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_CC_DIRECT GO:0005576~extracellular region 3 13.043478260869565 0.5650202328331935 ENSG00000099617, ENSG00000143869, ENSG00000173401 22 1610 18224 1.5435347261434218 1.0 0.9995280289945352 99.96172821785414 UP_SEQ_FEATURE signal peptide 3 13.043478260869565 0.8873328320761085 ENSG00000099617, ENSG00000143869, ENSG00000173401 22 3346 20063 0.8176520132587077 1.0 0.9999999999999631 99.99999999371153 UP_KEYWORDS Signal 3 13.043478260869565 0.9449653686711603 ENSG00000099617, ENSG00000143869, ENSG00000173401 22 4160 20581 0.674639423076923 1.0 0.9999978479008454 99.99999999999622 Annotation Cluster 4 Enrichment Score: 0.06705635298464693 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_KEYWORDS Disulfide bond 4 17.391304347826086 0.7052904766215429 ENSG00000050438, ENSG00000099617, ENSG00000143869, ENSG00000149541 22 3434 20581 1.0896913220733837 1.0 0.9995057831808567 99.99977860872981 UP_SEQ_FEATURE disulfide bond 3 13.043478260869565 0.8313843498748016 ENSG00000099617, ENSG00000143869, ENSG00000149541 22 2917 20063 0.9379032006731698 1.0 0.9999999999999281 99.99999951871355 UP_SEQ_FEATURE glycosylation site:N-linked (GlcNAc...) 3 13.043478260869565 0.9544875159034373 ENSG00000099617, ENSG00000143869, ENSG00000149541 22 4234 20063 0.6461652424099282 1.0 1.0 99.99999999999963 UP_KEYWORDS Glycoprotein 3 13.043478260869565 0.963461735118086 ENSG00000099617, ENSG00000143869, ENSG00000149541 22 4551 20581 0.616677653263019 1.0 0.9999984894709725 99.99999999999996
Compute the Spearman correlation of relative abundance between all pairs of bulk samples.
R = pd.Series([st.mstats.spearmanr(S[i], S[j]).correlation for i, j in itertools.combinations(S.columns, 2)])
R.describe()
Compute the Spearman correlation between all pairs of scRNA-Seq estimates.
R = pd.Series([st.mstats.spearmanr(T[i], T[j]).correlation for i, j in itertools.combinations(T.columns, 2)])
R.describe()
Compute the Spearman correlation between bulk and single cell.
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
Compute the Spearman correlation for randomized pairs of bulk/single cell abundances.
np.random.seed(0)
pd.Series([st.mstats.spearmanr(S[i], T[j]).correlation
for i in np.random.choice(S.columns, 20, replace=True)
for j in np.random.choice(T.columns, 20, replace=True)]).describe()
Under our assumed model, the parameter \(μ\) is proportional to relative abundance.
log_mu = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0)
logodds = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0)
# Important: log(sigmoid(x)) = -softplus(-x)
log_mu -= np.log1p(np.exp(logodds))
log_mu -= log_mu.agg(sp.logsumexp, axis=0)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()]
.mask(mask)
.align(log_mu.mask(mask), join='inner'))
Plot the concordance.
plot_concordance_rho(
S,
T,
'/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/vs-sc-mean')
file:figure/sc-vs-bulk.org/vs-sc-mean/NA18507.svg file:figure/sc-vs-bulk.org/vs-sc-mean/NA18522.svg
Look at the genes only detected in bulk.
with open('/scratch/midway2/aksarkar/singlecell/density-estimation/bulk-only.txt', 'w') as f:
print(*set([x for k in T for x in pd.Series(T[k].mask(np.isfinite(S[k])).dropna().index)]), sep='\n', file=f)
Look at pathway enrichment for genes only detected in bulk.
curl "https://david.ncifcrf.gov/data/download/t2t_B5C5AE2A242A1525715859955.txt"
Compare ZINB estimate against pooled MLE.
S, T = log_mu.mask(mask).align(sc_log_rho.mask(mask), join='inner')
del S['NA18498']
del T['NA18498']
diff = abs(S - T)
diff.describe()
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
plot_concordance_rho(
S,
T,
xlabel='ZINB ln relative abundance',
ylabel='Pooled ln relative abundance',
output_dir='/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/zinb-vs-pooled')
Look at \((1 - π) μ\).
logodds = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/without-cell-cycle/zi2-logodds.txt.gz', sep=' ', index_col=0)
corrected_log_mu = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0)
corrected_log_mu *= sp.expit(logodds)
corrected_log_mu -= sp.logsumexp(corrected_log_mu)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()]
.mask(mask)
.align(corrected_log_mu.mask(mask), join='inner'))
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
S, T = (sc_log_rho.loc[keep_genes.values.ravel()]
.mask(mask)
.align(corrected_log_mu.mask(mask), join='inner'))
del S["NA18507"]
del T["NA18507"]
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
Plot concordance between bulk vs pools of single cells, focusing on genes which have log-transformed expression at least 1 in both assays.
plot_concordance_by_num_cells(
'NA18507',
umi,
annotations,
bulk_log_tpm,
'/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/vs-cells/'
)
file:figure/sc-vs-bulk.org/vs-cells/NA18507-1.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-10.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-50.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-100.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-200.svg
Ensure that pools don’t overlap by randomly sampling double the cells and partitioning into two halves.
plot_concordance_pooled_subsets(
'NA18507',
umi,
annotations,
'/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/subsets/'
)
file:figure/sc-vs-bulk.org/subsets/NA18507-1.svg file:figure/sc-vs-bulk.org/subsets/NA18507-10.svg file:figure/sc-vs-bulk.org/subsets/NA18507-50.svg file:figure/sc-vs-bulk.org/subsets/NA18507-100.svg
Chu et al 2016 profiled hESC using single cell and matched bulk RNA-Seq (GSE75748). Analyze their data analagously to understand whether the correlation we observe is anomalous.
curl -sO --ftp-pasv ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75748/suppl/GSE75748_bulk_cell_type_ec.csv.gz
curl -sO --ftp-pasv ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75748/suppl/GSE75748_sc_cell_type_ec.csv.gz
chu_bulk_tpm = pd.read_table('/scratch/midway2/aksarkar/singlecell/hesc/GSE75748_bulk_cell_type_ec.csv.gz', sep=',', index_col=0)
T = np.log(chu_bulk_tpm) - np.log(chu_bulk_tpm.sum(axis=0))
R = pd.DataFrame([(i, j, st.mstats.spearmanr(T[i], T[j]).correlation) for i, j in it.combinations(sorted(T.columns), 2)])
M = R.pivot(index=0, columns=1, values=2).T
plt.clf()
plt.imshow(M, cmap=colorcet.cm['kr'])
cb = plt.colorbar()
cb.set_label('Spearman correlation')
plt.gca().set_aspect('equal')
plt.xticks(range(M.shape[0]), M.columns, rotation=90)
_ = plt.yticks(range(M.shape[1]), M.index)
chu_sc_tpm = pd.read_table('/scratch/midway2/aksarkar/singlecell/hesc/GSE75748_sc_cell_type_ec.csv.gz', sep=',', index_col=0)
for k in ('H1', 'H9', 'DEC', 'EC', 'HFF', 'NPC', 'TB'):
bulk_rho = chu_bulk_tpm.filter(like=k, axis='columns').agg(np.mean, axis=1)
bulk_rho = np.log(bulk_rho) - np.log(bulk_rho.sum(axis=0))
sc_rho = chu_sc_tpm.filter(like=k, axis='columns').agg(np.mean, axis=1)
sc_rho = np.log(sc_rho) - np.log(sc_rho.sum(axis=0))
x = sc_rho.mask(mask).dropna().to_frame()
y = bulk_rho.mask(mask).dropna().to_frame()
plot_concordance(
x=x,
y=y,
title=k,
gridsize=20,
filename='/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/hesc/{}.svg'.format(k),
xlabel='Single cell ln relative abundance',
ylabel='Bulk ln relative abundance',
)