Skip to content

Latest commit

 

History

History
1134 lines (976 loc) · 95.7 KB

sc-vs-bulk.org

File metadata and controls

1134 lines (976 loc) · 95.7 KB

Single cell/bulk RNA-Seq concordance

Introduction

Here, we investigate the correlation between single cell RNA-Seq and bulk RNA-Seq on samples from the same cell type in the same individuals. Our goal is to qualitatively assess our ability to call QTLs from scRNA-Seq.

Setup

(org-babel-lob-ingest "/home/aksarkar/.emacs.d/org-templates/library.org")
%config InlineBackend.figure_formats = set(['svg'])

import colorcet
import itertools as it
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.special as sp
import scipy.stats as st
import sqlite3

Implementation

def plot_concordance(x, y, title, filename, xlabel=None, ylabel=None, lim=None, **kwargs):
  """Plot hexbin of concordance"""
  merged = x.merge(y, left_index=True, right_index=True)
  merged.columns = ['x', 'y']
  if lim is None:
    lim = [merged.min().min(), merged.max().max()]
  plt.clf()
  if 'gridsize' not in kwargs:
    kwargs['gridsize'] = 40
  hexes = plt.hexbin(merged['x'], merged['y'], cmap=colorcet.cm['blues'], extent=lim + lim, **kwargs)
  ax = plt.gca()
  if lim is None:
    ax.set_xlim([merged['x'].min(), merged['x'].max()])
    ax.set_ylim([merged['y'].min(), merged['y'].max()])
  else:
    ax.set_xlim(lim)
    ax.set_ylim(lim)
  ax.set_aspect('equal')
  cb = plt.colorbar()
  cb.set_label('Number of genes')
  plt.plot(lim, lim, color='red')
  plt.title(title)
  if xlabel is None:
    xlabel = 'scRNA-Seq $\log_2(\mathrm{CPM} + 1) $'
  if ylabel is None:
    ylabel = 'Bulk RNA-Seq $\log_2(\mathrm{TPM} + 1)$'
  plt.xlabel(xlabel)
  plt.ylabel(ylabel)
  plt.savefig(filename)

def cpm(counts, size=None, log2=False):
  if size is None:
    size = counts.sum(axis=0)
  cpm = counts / size * 1e6
  if log2:
    cpm = np.log(cpm + 1) / np.log(2)
  return cpm
def plot_concordance_by_individual(umi, annotations, bulk, output_dir):
  bulk, pooled_cpm = bulk.align(
    cpm(umi.groupby(by=annotations['chip_id'].values, axis=1).agg(np.sum),
        size=annotations.groupby('chip_id')['mol_hs'].agg(np.sum), log2=True),
    axis=1, join='inner')
  for k in bulk:
    plot_concordance(
      x=pooled_cpm[k].to_frame(),
      y=bulk[k].to_frame(),
      title=k,
      filename='{}/{}.svg'.format(output_dir, k))
def plot_concordance_by_num_cells(individual, umi, annotations, bulk_tpm, output_dir):
  bulk_tpm = bulk_tpm[individual].to_frame()
  umi = umi.loc[:,(annotations['chip_id'] == individual).values]
  annotations = annotations[annotations['chip_id'] == individual]
  for num_cells in [1, 10, 50, 100, 200]:
    sample = np.random.choice(annotations.shape[0], size=num_cells)
    pooled_cpm = cpm(umi.iloc[:,sample].sum(axis=1).to_frame(),
                     size=annotations.iloc[sample]['mol_hs'].agg(np.sum), log2=True)
    plot_concordance(
      x=pooled_cpm,
      y=bulk_tpm,
      title='{}, {} cell{}'.format(individual, num_cells, 's' if num_cells > 1 else ''),
      filename='{}/{}-{}.svg'.format(output_dir, individual, num_cells),
      gridsize=20)
def plot_concordance_pooled_subsets(individual, umi, annotations, output_dir):
  umi = umi.loc[:,(annotations['chip_id'] == individual).values]
  for num_cells in [1, 10, 50, 100]:
    sample = umi.sample(n=2 * num_cells, axis=1)
    pool1 = cpm(sample.iloc[:,:num_cells].sum(axis=1).to_frame(), log2=True)
    pool2 = cpm(sample.iloc[:,num_cells:].sum(axis=1).to_frame(), log2=True)
    plot_concordance(
      x=pool1,
      y=pool2,
      title='{}, {} cell{}'.format(individual, num_cells, 's' if num_cells > 1 else ''),
      filename='{}/{}-{}.svg'.format(output_dir, individual, num_cells),
      ylabel='scRNA-Seq $\log_2(\mathrm{CPM} + 1)$',
      gridsize=15)
def plot_concordance_rho(bulk, sc, output_dir, **kwargs):
  if 'xlabel' not in kwargs:
    kwargs['xlabel'] = 'Single cell ln relative abundance'
  if 'ylabel' not in kwargs:
    kwargs['ylabel'] = 'Bulk ln relative abundance'
  bulk, sc = bulk.align(sc, axis=1, join='inner')
  for k in bulk:
    y = bulk[k].dropna().to_frame()
    x = sc[k].dropna().to_frame()
    plot_concordance(
      x=x,
      y=y,
      title=k,
      gridsize=20,
      **kwargs,
      filename='{}/{}.svg'.format(output_dir, k))
def mask(df):
  return ~np.isfinite(df)

Read the data

Read the QC files.

annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None)
annotations = annotations.loc[keep_samples.values.ravel()]

Read the UMI matrix.

umi = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0)
umi = umi.loc[:,keep_samples.values.ravel()]

The only quantity which is directly comparable between bulk and scRNA-Seq is relative abundance (Pachter 2011). Therefore, we re-processed the iPSC bulk RNA-Seq using kallisto.

Important: we need to quantify relative abundance with respect to exactly the same set of genes as the single cell data.

bulk_tpm = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/kallisto/bulk-ipsc-tpm.txt.gz', header=None, sep=' ').pivot(columns=0, index=1, values=2)
bulk_log_rho = np.log(bulk_tpm) - np.log(bulk_tpm.sum(axis=0))

Plot bulk vs. pooled single cells

Pool the single cells and estimate log CPM. We assume this is proportional to relative abundance (we assume UMIs really do directly count molecules).

sc_log_cpm = np.log(umi.groupby(annotations['chip_id'].values, axis=1).agg(np.sum)) - np.log(annotations.groupby('chip_id')['mol_hs'].agg(np.sum)) + 6 * np.log(10)
sc_log_rho = sc_log_cpm - sp.logsumexp(sc_log_cpm, axis=0)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()]
        .mask(mask)
        .align(sc_log_rho.loc[keep_genes.values.ravel()]
               .mask(mask), join='inner'))
plot_concordance_rho(
  S,
  T,
  '/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/pooled/')

Plot the individual with the most cells, and the fewest cells.

file:figure/sc-vs-bulk.org/pooled/NA18507.svg file:figure/sc-vs-bulk.org/pooled/NA18522.svg

Look at the distribution of absolute differences.

S, T = bulk_log_rho.align(sc_log_rho, join='inner')
diff = abs(S - T)
np.nanpercentile(diff.mask(~np.isfinite(diff)), [90, 95, 99, 99.5, 99.9])

Look at pathway enrichment for genes only detected in bulk.

bulk_only = set([x for k in T for x in pd.Series(T[k].mask(np.isfinite(S[k])).dropna().index)])
Annotation Cluster 1	Enrichment Score: 6.163250777011418
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
INTERPRO	IPR018064:Metallothionein, vertebrate, metal binding site	6	5.9405940594059405	5.214809864836233E-10	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	79	11	18559	128.14039125431532	8.343695168111509E-8	8.343695168111509E-8	6.296819954343391E-7
UP_SEQ_FEATURE	region of interest:Alpha	6	5.9405940594059405	1.0452172113872762E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	80	13	20063	115.74807692307692	2.4144515486934637E-7	2.4144515486934637E-7	1.3409402699338102E-6
UP_SEQ_FEATURE	region of interest:Beta	6	5.9405940594059405	1.0452172113872762E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	80	13	20063	115.74807692307692	2.4144515486934637E-7	2.4144515486934637E-7	1.3409402699338102E-6
UP_SEQ_FEATURE	metal ion-binding site:Divalent metal cation; cluster A	6	5.9405940594059405	1.6208966909643656E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	80	14	20063	107.48035714285714	3.744270538064143E-7	1.8721354444473093E-7	2.0794964750159295E-6
UP_SEQ_FEATURE	metal ion-binding site:Divalent metal cation; cluster B	6	5.9405940594059405	1.6208966909643656E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	80	14	20063	107.48035714285714	3.744270538064143E-7	1.8721354444473093E-7	2.0794964750159295E-6
INTERPRO	IPR003019:Metallothionein superfamily, eukaryotic	6	5.9405940594059405	2.2376038114295356E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	79	14	18559	100.68173598553346	3.580165393035628E-7	1.7900828563899296E-7	2.7018795867306267E-6
INTERPRO	IPR000006:Metallothionein, vertebrate	6	5.9405940594059405	2.2376038114295356E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	79	14	18559	100.68173598553346	3.580165393035628E-7	1.7900828563899296E-7	2.7018795867306267E-6
INTERPRO	IPR023587:Metallothionein domain, vertebrate	6	5.9405940594059405	2.2376038114295356E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	79	14	18559	100.68173598553346	3.580165393035628E-7	1.7900828563899296E-7	2.7018795867306267E-6
INTERPRO	IPR017854:Metallothionein domain	6	5.9405940594059405	2.2376038114295356E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	79	14	18559	100.68173598553346	3.580165393035628E-7	1.7900828563899296E-7	2.7018795867306267E-6
UP_KEYWORDS	Metal-thiolate cluster	6	5.9405940594059405	3.848645320957292E-9	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	97	14	20581	90.93225331369662	5.272642669140737E-7	5.272642669140737E-7	4.523761099051171E-6
GOTERM_BP_DIRECT	GO:0045926~negative regulation of growth	6	5.9405940594059405	1.6056085205036586E-8	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	75	19	16792	70.70315789473685	6.727477139589766E-6	6.727477139589766E-6	2.2546318878546856E-5
GOTERM_BP_DIRECT	GO:0071294~cellular response to zinc ion	6	5.9405940594059405	1.6056085205036586E-8	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	75	19	16792	70.70315789473685	6.727477139589766E-6	6.727477139589766E-6	2.2546318878546856E-5
UP_KEYWORDS	Cadmium	5	4.9504950495049505	5.5012039998665416E-8	ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417	97	9	20581	117.87514318442153	7.536621291603929E-6	3.7683177458447403E-6	6.466204057753444E-5
KEGG_PATHWAY	hsa04978:Mineral absorption	6	5.9405940594059405	6.197865698893864E-7	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	27	46	6910	33.38164251207729	3.1608625294388126E-5	3.1608625294388126E-5	5.995777879075348E-4
GOTERM_BP_DIRECT	GO:0071276~cellular response to cadmium ion	5	4.9504950495049505	7.917968157631074E-7	ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417	75	17	16792	65.85098039215686	3.317079698986758E-4	1.6586774100302293E-4	0.0011118534141041359
UP_KEYWORDS	Copper	5	4.9504950495049505	2.4203238770349687E-4	ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417	97	65	20581	16.321173671689134	0.03261860473945333	0.008256340267242868	0.2841189120846299
GOTERM_MF_DIRECT	GO:0046872~metal ion binding	13	12.871287128712872	0.12273080737129344	ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000205358, ENSG00000187193, ENSG00000121691, ENSG00000198417	69	2069	16881	1.5372055393279678	0.9999998290722704	0.7894858696046019	77.70573642627673
GOTERM_MF_DIRECT	GO:0008270~zinc ion binding	8	7.920792079207921	0.18975529795362245	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417	69	1169	16881	1.6742663740841297	0.9999999999866547	0.8758984183433024	91.03468733803763
GOTERM_CC_DIRECT	GO:0048471~perinuclear region of cytoplasm	6	5.9405940594059405	0.19285594142887852	ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417	91	621	18224	1.934915326219674	0.9999999919695161	0.9759592316379472	90.12891148150037
UP_KEYWORDS	Zinc	13	12.871287128712872	0.41448076703181924	ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417	97	2348	20581	1.1747352429793287	1.0	0.9829889947051305	99.81480020819157
UP_KEYWORDS	Metal-binding	15	14.85148514851485	0.8237195669856142	ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000121691, ENSG00000165188, ENSG00000247746	97	3640	20581	0.8743485895547751	1.0	0.9992576564560991	99.99999986203825

Annotation Cluster 2	Enrichment Score: 3.484819795455531
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
INTERPRO	IPR001152:Thymosin beta-4	4	3.9603960396039604	7.098533109747709E-7	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	79	5	18559	187.93924050632913	1.1357012050017268E-4	3.785814005408117E-5	8.571359595421768E-4
SMART	SM00152:THY	4	3.9603960396039604	7.766657663469284E-7	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	45	5	10057	178.79111111111112	2.873623162158445E-5	2.873623162158445E-5	6.973781416896863E-4
PIR_SUPERFAMILY	PIRSF001828:thymosin beta	4	3.9603960396039604	2.7080998708248916E-6	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	13	5	1692	104.12307692307694	2.708066868917225E-5	2.708066868917225E-5	0.0016261217555713081
GOTERM_BP_DIRECT	GO:0042989~sequestering of actin monomers	4	3.9603960396039604	9.642416279102543E-6	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	75	10	16792	89.55733333333333	0.004032041304637524	0.0013458241984475316	0.013539249589888946
GOTERM_MF_DIRECT	GO:0003785~actin monomer binding	4	3.9603960396039604	1.5210285357647118E-4	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	69	26	16881	37.63879598662207	0.017938766106036064	0.017938766106036064	0.1742005540888658
GOTERM_CC_DIRECT	GO:0031941~filamentous actin	4	3.9603960396039604	4.7367184906870326E-4	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	91	31	18224	25.840482098546616	0.04038125481188837	0.04038125481188837	0.5107377212681841
GOTERM_BP_DIRECT	GO:0007015~actin filament organization	4	3.9603960396039604	0.0039404588803159685	ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164	75	72	16792	12.43851851851852	0.8087767706624795	0.21048033854212067	5.393322740309592
UP_KEYWORDS	Actin-binding	5	4.9504950495049505	0.0394339844994075	ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164	97	274	20581	3.871811272480999	0.9959614191508189	0.5449776981781902	37.680777033965654
GOTERM_BP_DIRECT	GO:0030036~actin cytoskeleton organization	3	2.9702970297029703	0.11220693889642849	ENSG00000034510, ENSG00000205542, ENSG00000158164	75	130	16792	5.166769230769231	1.0	0.9373656872217728	81.19896543130287
UP_KEYWORDS	Cytoskeleton	5	4.9504950495049505	0.7843259865566969	ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164	97	1138	20581	0.9322287246571123	1.0	0.9985945289079377	99.99999852310495

Annotation Cluster 3	Enrichment Score: 1.7811609652286242
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
SMART	SM00389:HOX	8	7.920792079207921	9.417302516012567E-5	ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093	45	250	10057	7.151644444444445	0.003478501955001434	0.0017407661108270744	0.08452766486343188
INTERPRO	IPR001356:Homeodomain	8	7.920792079207921	1.0044839918367324E-4	ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093	79	256	18559	7.341376582278481	0.015944076888024683	0.0040100758644506795	0.12122279079956888
UP_SEQ_FEATURE	DNA-binding region:Homeobox	7	6.9306930693069315	1.0745524652182817E-4	ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093	80	191	20063	9.191164921465969	0.02451792636356398	0.008240359123208085	0.13776994804843845
UP_KEYWORDS	Homeobox	8	7.920792079207921	2.273035742769636E-4	ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093	97	262	20581	6.478633823876604	0.030664146783657475	0.010327675965698613	0.26685006254048016
GOTERM_BP_DIRECT	GO:0006366~transcription from RNA polymerase II promoter	10	9.900990099009901	4.090749986540058E-4	ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598	75	513	16792	4.364392462638077	0.15754707376630528	0.028168569886196426	0.5729025901226814
INTERPRO	IPR009057:Homeodomain-like	8	7.920792079207921	5.261429021979469E-4	ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093	79	336	18559	5.59342977697408	0.08075722475289715	0.01669998667646322	0.6334635185275994
GOTERM_MF_DIRECT	GO:0001077~transcriptional activator activity, RNA polymerase II core promoter proximal region sequence-specific binding	6	5.9405940594059405	0.002608319419483337	ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000164093	69	236	16881	6.219970523212969	0.2671362793302654	0.1439254000557343	2.949180047116129
UP_SEQ_FEATURE	compositionally biased region:Poly-Ala	7	6.9306930693069315	0.00518825052735661	ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000171450, ENSG00000163508	80	404	20063	4.3453279702970296	0.6992890245264455	0.25947910072369085	6.455682440417276
UP_KEYWORDS	DNA-binding	19	18.81188118811881	0.0061751724814120695	ENSG00000129514, ENSG00000184302, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508	97	2050	20581	1.9665023887352275	0.5719954819022799	0.15610255752991076	7.022162963087153
GOTERM_BP_DIRECT	GO:0007420~brain development	5	4.9504950495049505	0.00987285598589564	ENSG00000152977, ENSG00000053438, ENSG00000138083, ENSG00000054598, ENSG00000163508	75	190	16792	5.891929824561403	0.9843499743231441	0.36992744676798917	13.00554505004673
GOTERM_BP_DIRECT	GO:0045944~positive regulation of transcription from RNA polymerase II promoter	11	10.891089108910892	0.01060386709098585	ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508	75	981	16792	2.5105266734624534	0.9885153598169162	0.36024787175300976	13.90312024253032
UP_SEQ_FEATURE	compositionally biased region:Poly-Gly	5	4.9504950495049505	0.028203027199206778	ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000163508	80	292	20063	4.294306506849315	0.9986511918732182	0.7333202527794147	30.72078907690975
GOTERM_MF_DIRECT	GO:0000981~RNA polymerase II transcription factor activity, sequence-specific DNA binding	4	3.9603960396039604	0.03168704614824783	ENSG00000129514, ENSG00000184302, ENSG00000138083, ENSG00000054598	69	171	16881	5.722857869310959	0.9783292834625844	0.7212010778222342	30.862537573604364
UP_KEYWORDS	Developmental protein	10	9.900990099009901	0.03297928048143198	ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000164853, ENSG00000053438, ENSG00000138083, ENSG00000164093, ENSG00000163508	97	949	20581	2.2357772152998816	0.9898911492926898	0.5350028435318056	32.57673990320743
GOTERM_MF_DIRECT	GO:0003677~DNA binding	13	12.871287128712872	0.03398271598877738	ENSG00000184302, ENSG00000129514, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508	69	1674	16881	1.8999272765051167	0.9836618555842146	0.6424797192945351	32.718143656703056
INTERPRO	IPR017970:Homeobox, conserved site	4	3.9603960396039604	0.04596070918167841	ENSG00000109132, ENSG00000164853, ENSG00000170561, ENSG00000164093	79	190	18559	4.94576948700866	0.9994622235937994	0.7148323276503672	43.341501655162176
GOTERM_MF_DIRECT	GO:0000977~RNA polymerase II regulatory region sequence-specific DNA binding	4	3.9603960396039604	0.05156656275239971	ENSG00000109132, ENSG00000205922, ENSG00000054598, ENSG00000163508	69	208	16881	4.704849498327759	0.9981642397951731	0.7163627995115522	45.49275378977236
GOTERM_MF_DIRECT	GO:0000978~RNA polymerase II core promoter proximal region sequence-specific DNA binding	5	4.9504950495049505	0.054849155170871446	ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000164093	69	355	16881	3.445805266380894	0.9987848382314719	0.673332685025507	47.616359931448414
GOTERM_MF_DIRECT	GO:0003700~transcription factor activity, sequence-specific DNA binding	7	6.9306930693069315	0.18959031696019613	ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508	69	961	16881	1.782065782925395	0.9999999999863274	0.8971154894254747	91.01374122380184
UP_KEYWORDS	Activator	6	5.9405940594059405	0.19590645201033402	ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000163508	97	661	20581	1.9259478765382037	0.9999999999998936	0.8995206058743108	92.29171287120641
GOTERM_BP_DIRECT	GO:0045893~positive regulation of transcription, DNA-templated	4	3.9603960396039604	0.3971154565879258	ENSG00000152977, ENSG00000143869, ENSG00000054598, ENSG00000163508	75	515	16792	1.7389773462783173	1.0	0.9999587761988841	99.91796526374138
UP_KEYWORDS	Transcription regulation	13	12.871287128712872	0.40500487412843467	ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508	97	2332	20581	1.1827951760357907	1.0	0.9847649688241881	99.77633739079752
UP_KEYWORDS	Transcription	13	12.871287128712872	0.44414461678326184	ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508	97	2398	20581	1.1502411803650807	1.0	0.9820944180997002	99.89948309755069
UP_KEYWORDS	Nucleus	25	24.752475247524753	0.5820328536133491	ENSG00000164853, ENSG00000253626, ENSG00000253506, ENSG00000159182, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000147481, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000129514, ENSG00000184302, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000170561, ENSG00000138083, ENSG00000250254, ENSG00000164093, ENSG00000182195, ENSG00000119547, ENSG00000153266, ENSG00000163508	97	5244	20581	1.0115143865940062	1.0	0.9956273086940396	99.99647757087135
GOTERM_BP_DIRECT	GO:0030154~cell differentiation	3	2.9702970297029703	0.6080833739283924	ENSG00000152977, ENSG00000119547, ENSG00000205922	75	462	16792	1.453852813852814	1.0	0.9999995136278385	99.99980612003067
UP_KEYWORDS	Disease mutation	9	8.91089108910891	0.9204249405761661	ENSG00000152977, ENSG00000184302, ENSG00000109132, ENSG00000124172, ENSG00000138083, ENSG00000168878, ENSG00000164093, ENSG00000054598, ENSG00000248099	97	2550	20581	0.7488538508186781	1.0	0.9999149050236531	99.99999999998799

Annotation Cluster 4	Enrichment Score: 1.165498308323919
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
GOTERM_BP_DIRECT	GO:0006413~translational initiation	6	5.9405940594059405	3.446046051185818E-4	ENSG00000067048, ENSG00000198692, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000205609	75	137	16792	9.805547445255476	0.1344708403894822	0.03545959319361591	0.4828157254203713
GOTERM_CC_DIRECT	GO:0022625~cytosolic large ribosomal subunit	3	2.9702970297029703	0.04452488666483783	ENSG00000198918, ENSG00000229117, ENSG00000163923	91	68	18224	8.835164835164836	0.9809854506297719	0.8621067464658695	38.874961529405915
UP_KEYWORDS	Ribosomal protein	4	3.9603960396039604	0.055666091991101146	ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923	97	185	20581	4.58757314015046	0.9996089745535879	0.6250048700776605	48.99391263132687
GOTERM_MF_DIRECT	GO:0003735~structural constituent of ribosome	4	3.9603960396039604	0.06034723008605642	ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923	69	222	16881	4.408147277712495	0.9993930954418766	0.6038222785369594	51.00471832879968
GOTERM_BP_DIRECT	GO:0006614~SRP-dependent cotranslational protein targeting to membrane	3	2.9702970297029703	0.06460194328297483	ENSG00000129824, ENSG00000198918, ENSG00000229117	75	94	16792	7.145531914893617	0.9999999999992961	0.8838033244845918	60.850545022072836
GOTERM_MF_DIRECT	GO:0003723~RNA binding	6	5.9405940594059405	0.06925428571255729	ENSG00000067048, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923, ENSG00000178997	69	547	16881	2.6835704633971864	0.9998046135816214	0.6128518819827384	56.07177294869783
GOTERM_BP_DIRECT	GO:0019083~viral transcription	3	2.9702970297029703	0.0874042522465415	ENSG00000129824, ENSG00000198918, ENSG00000229117	75	112	16792	5.997142857142857	1.0	0.922295808229385	72.31651234196252
KEGG_PATHWAY	hsa03010:Ribosome	3	2.9702970297029703	0.09197141519874578	ENSG00000129824, ENSG00000198918, ENSG00000229117	27	136	6910	5.645424836601307	0.9927041560195942	0.8060490790853818	60.67613480771449
GOTERM_BP_DIRECT	GO:0000184~nuclear-transcribed mRNA catabolic process, nonsense-mediated decay	3	2.9702970297029703	0.09683692108179212	ENSG00000129824, ENSG00000198918, ENSG00000229117	75	119	16792	5.64436974789916	1.0	0.9305572806891955	76.07463661943554
GOTERM_BP_DIRECT	GO:0006412~translation	4	3.9603960396039604	0.10082618317693195	ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923	75	253	16792	3.539815546772069	1.0	0.927158013883527	77.51659768922529
UP_KEYWORDS	Ribonucleoprotein	4	3.9603960396039604	0.16041445057926768	ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923	97	296	20581	2.867233212594037	0.9999999999604725	0.8641452261285151	87.1930135274217
GOTERM_BP_DIRECT	GO:0006364~rRNA processing	3	2.9702970297029703	0.24311129519341956	ENSG00000129824, ENSG00000198918, ENSG00000229117	75	214	16792	3.138691588785047	1.0	0.9978504982777531	97.9985714847756
GOTERM_CC_DIRECT	GO:0005829~cytosol	7	6.9306930693069315	0.9995612834920512	ENSG00000226784, ENSG00000125148, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000154620, ENSG00000121691	91	3315	18224	0.42287968441814594	1.0	1.0	100.0

Annotation Cluster 5	Enrichment Score: 0.820211096675127
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
GOTERM_MF_DIRECT	GO:0043565~sequence-specific DNA binding	6	5.9405940594059405	0.0575036128572627	ENSG00000129514, ENSG00000164853, ENSG00000170561, ENSG00000164093, ENSG00000054598, ENSG00000163508	69	518	16881	2.833808964243747	0.999130471945484	0.6346115628640061	49.27807833492119
GOTERM_MF_DIRECT	GO:0003700~transcription factor activity, sequence-specific DNA binding	7	6.9306930693069315	0.18959031696019613	ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508	69	961	16881	1.782065782925395	0.9999999999863274	0.8971154894254747	91.01374122380184
GOTERM_MF_DIRECT	GO:0008134~transcription factor binding	3	2.9702970297029703	0.31758166829967777	ENSG00000129514, ENSG00000164093, ENSG00000054598	69	284	16881	2.5843539497856707	1.0	0.9697360578283208	98.74714487738314

Annotation Cluster 6	Enrichment Score: 0.508309881872388
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
INTERPRO	IPR002110:Ankyrin repeat	3	2.9702970297029703	0.29082448887491896	ENSG00000222038, ENSG00000163046, ENSG00000196834	79	255	18559	2.763812360387193	1.0	0.9989646799915811	98.42286358849252
SMART	SM00248:ANK	3	2.9702970297029703	0.2975318758729139	ENSG00000222038, ENSG00000163046, ENSG00000196834	45	249	10057	2.692637215528782	0.9999978856103895	0.9871650256415595	95.80398991675298
INTERPRO	IPR020683:Ankyrin repeat-containing domain	3	2.9702970297029703	0.30637168905164147	ENSG00000222038, ENSG00000163046, ENSG00000196834	79	265	18559	2.6595175543348457	1.0	0.9985017380546117	98.79322761229355
UP_KEYWORDS	ANK repeat	3	2.9702970297029703	0.3494196849000245	ENSG00000222038, ENSG00000163046, ENSG00000196834	97	264	20581	2.411082474226804	1.0	0.9748006629928797	99.36098921844489

Annotation Cluster 7	Enrichment Score: 0.354336784895891
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
UP_SEQ_FEATURE	zinc finger region:C2H2-type 4	5	4.9504950495049505	0.20136843576824753	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	80	588	20063	2.132546768707483	1.0	0.9998260841655876	94.41306218498366
UP_SEQ_FEATURE	zinc finger region:C2H2-type 3	5	4.9504950495049505	0.24137654936132147	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	80	636	20063	1.9715998427672958	1.0	0.9998901331440954	97.1105095588215
INTERPRO	IPR013087:Zinc finger C2H2-type/integrase DNA-binding domain	5	4.9504950495049505	0.3509812706610015	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	79	712	18559	1.649747546579434	1.0	0.999008904105432	99.45921000508513
UP_SEQ_FEATURE	zinc finger region:C2H2-type 5	4	3.9603960396039604	0.3687510325334959	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932	80	550	20063	1.823909090909091	1.0	0.9999925558095984	99.72665248783586
INTERPRO	IPR015880:Zinc finger, C2H2-like	5	4.9504950495049505	0.3990477278010214	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	79	762	18559	1.5414963952290774	1.0	0.999393086210869	99.78644025284262
UP_KEYWORDS	Transcription regulation	13	12.871287128712872	0.40500487412843467	ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508	97	2332	20581	1.1827951760357907	1.0	0.9847649688241881	99.77633739079752
UP_KEYWORDS	Zinc	13	12.871287128712872	0.41448076703181924	ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417	97	2348	20581	1.1747352429793287	1.0	0.9829889947051305	99.81480020819157
SMART	SM00355:ZnF_C2H2	5	4.9504950495049505	0.4300934117849453	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	45	762	10057	1.4664625255176436	0.9999999990779891	0.994489588687009	99.35831314673327
INTERPRO	IPR007087:Zinc finger, C2H2	5	4.9504950495049505	0.4343150143180009	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	79	799	18559	1.4701129576527623	1.0	0.9994976627786998	99.89711105742633
UP_SEQ_FEATURE	zinc finger region:C2H2-type 2	4	3.9603960396039604	0.4378369016202355	ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397	80	615	20063	1.631138211382114	1.0	0.9999983334439706	99.93821123573167
UP_KEYWORDS	Transcription	13	12.871287128712872	0.44414461678326184	ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508	97	2398	20581	1.1502411803650807	1.0	0.9820944180997002	99.89948309755069
UP_SEQ_FEATURE	zinc finger region:C2H2-type 6	3	2.9702970297029703	0.590458495161822	ENSG00000153266, ENSG00000198105, ENSG00000177932	80	501	20063	1.5017215568862274	1.0	0.9999999655860128	99.99893817056615
UP_SEQ_FEATURE	zinc finger region:C2H2-type 1	3	2.9702970297029703	0.6455180669053127	ENSG00000153266, ENSG00000177932, ENSG00000215397	80	554	20063	1.3580550541516245	1.0	0.999999962991393	99.99983342537078
UP_KEYWORDS	Zinc-finger	7	6.9306930693069315	0.8479589820363391	ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397, ENSG00000165188, ENSG00000247746	97	1781	20581	0.8339285817651383	1.0	0.9994944279023904	99.99999997575446
GOTERM_BP_DIRECT	GO:0006351~transcription, DNA-templated	5	4.9504950495049505	0.9785388522558911	ENSG00000164853, ENSG00000198105, ENSG00000177932, ENSG00000215397, ENSG00000163508	75	1955	16792	0.572617220801364	1.0	1.0	100.0

Annotation Cluster 8	Enrichment Score: 0.06711873232279976
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
GOTERM_CC_DIRECT	GO:0005615~extracellular space	9	8.91089108910891	0.3473712675240062	ENSG00000165246, ENSG00000222038, ENSG00000198918, ENSG00000096088, ENSG00000196834, ENSG00000270136, ENSG00000143869, ENSG00000121691, ENSG00000168878	91	1347	18224	1.3380650529871019	0.9999999999999999	0.995027606334708	99.00691145271703
UP_SEQ_FEATURE	disulfide bond	7	6.9306930693069315	0.9799768322289583	ENSG00000165246, ENSG00000180974, ENSG00000096088, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099	80	2917	20063	0.6018212204319506	1.0	1.0	100.0
UP_KEYWORDS	Disulfide bond	8	7.920792079207921	0.9979253382713639	ENSG00000165246, ENSG00000180974, ENSG00000096088, ENSG00000270136, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099	97	3434	20581	0.4942929708374112	1.0	0.999999993214293	100.0
UP_SEQ_FEATURE	signal peptide	6	5.9405940594059405	0.9982749748270936	ENSG00000179542, ENSG00000165246, ENSG00000096088, ENSG00000143869, ENSG00000168878, ENSG00000248099	80	3346	20063	0.4497086072922893	1.0	1.0	100.0
UP_KEYWORDS	Signal	9	8.91089108910891	0.9995774779340586	ENSG00000179542, ENSG00000214194, ENSG00000165246, ENSG00000096088, ENSG00000270136, ENSG00000143869, ENSG00000168878, ENSG00000243317, ENSG00000248099	97	4160	20581	0.459033009516257	1.0	0.9999999998538799	100.0
UP_SEQ_FEATURE	glycosylation site:N-linked (GlcNAc...)	5	4.9504950495049505	0.9999874126486624	ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000143869, ENSG00000168878	80	4234	20063	0.2961590694378838	1.0	1.0	100.0
UP_KEYWORDS	Glycoprotein	5	4.9504950495049505	0.9999998667567829	ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000143869, ENSG00000168878	97	4551	20581	0.23310839126780789	1.0	1.0	100.0

Annotation Cluster 9	Enrichment Score: 0.014705889178132312
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
GOTERM_MF_DIRECT	GO:0005524~ATP binding	4	3.9603960396039604	0.9471714074328096	ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142	69	1495	16881	0.6545877562890795	1.0	0.9999999988516262	99.99999999999977
UP_KEYWORDS	ATP-binding	4	3.9603960396039604	0.9617258940215971	ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142	97	1391	20581	0.6101373335210891	1.0	0.9999922196380046	100.0
UP_KEYWORDS	Nucleotide-binding	4	3.9603960396039604	0.9917505356146884	ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142	97	1788	20581	0.4746650061117646	1.0	0.9999997700488767	100.0

Annotation Cluster 10	Enrichment Score: 0.01422812959020817
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
UP_SEQ_FEATURE	transmembrane region	16	15.841584158415841	0.9235699422433608	ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000180974, ENSG00000165246, ENSG00000165188, ENSG00000166002, ENSG00000205670	80	5056	20063	0.7936313291139241	1.0	0.9999999999999999	99.99999999999953
UP_KEYWORDS	Transmembrane helix	20	19.801980198019802	0.9664300899010495	ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670	97	5634	20581	0.7531958030953453	1.0	0.9999910591386459	100.0
UP_KEYWORDS	Transmembrane	20	19.801980198019802	0.9677227406188967	ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670	97	5651	20581	0.7509299512721953	1.0	0.9999895880346041	100.0
UP_SEQ_FEATURE	topological domain:Extracellular	7	6.9306930693069315	0.9713553106975322	ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000086159, ENSG00000180638	80	2787	20063	0.6298932543954072	1.0	1.0	100.0
UP_SEQ_FEATURE	topological domain:Cytoplasmic	9	8.91089108910891	0.9727885778326631	ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142	80	3456	20063	0.6530924479166668	1.0	1.0	100.0
GOTERM_CC_DIRECT	GO:0016021~integral component of membrane	18	17.82178217821782	0.9855047988667748	ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670	91	5163	18224	0.6981885052774071	1.0	0.9999999999999534	100.0
UP_KEYWORDS	Membrane	26	25.742574257425744	0.9884542831964291	ENSG00000171840, ENSG00000198133, ENSG00000170091, ENSG00000253626, ENSG00000180638, ENSG00000197142, ENSG00000228474, ENSG00000179542, ENSG00000180389, ENSG00000124172, ENSG00000110934, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000176533, ENSG00000127540, ENSG00000139370, ENSG00000205639, ENSG00000086159, ENSG00000178449, ENSG00000214194, ENSG00000165246, ENSG00000180974, ENSG00000270136, ENSG00000171450, ENSG00000205670	97	7494	20581	0.7361297973086373	1.0	0.9999995215886112	100.0

Look at genes only detected in single cell.

set(x for k in S for x in pd.Series(S[k].mask(np.isfinite(T[k])).dropna().index))

Look at pathway enrichments for genes detected only in single cell.

Annotation Cluster 1	Enrichment Score: 0.16470559032620852
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
GOTERM_BP_DIRECT	GO:0045944~positive regulation of transcription from RNA polymerase II promoter	3	13.043478260869565	0.3277257959734531	ENSG00000111087, ENSG00000184302, ENSG00000120690	21	981	16792	2.4453181884374544	1.0	0.9999999999888005	98.99989403807203
UP_KEYWORDS	DNA-binding	3	13.043478260869565	0.6331696404915614	ENSG00000111087, ENSG00000184302, ENSG00000120690	22	2050	20581	1.3690243902439023	1.0	0.9988514746581159	99.99771774457302
GOTERM_CC_DIRECT	GO:0005654~nucleoplasm	3	13.043478260869565	0.852891320214584	ENSG00000111087, ENSG00000120690, ENSG00000127080	22	2784	18224	0.8926332288401253	1.0	0.9999966100403458	99.99999864269819
GOTERM_CC_DIRECT	GO:0005634~nucleus	5	21.73913043478261	0.9101247789696545	ENSG00000111087, ENSG00000184302, ENSG00000109618, ENSG00000120690, ENSG00000127080	22	5415	18224	0.7648787039368757	1.0	0.9999990374355104	99.99999998711735
UP_KEYWORDS	Nucleus	4	17.391304347826086	0.9320660841557977	ENSG00000111087, ENSG00000184302, ENSG00000120690, ENSG00000127080	22	5244	20581	0.7135774218154081	1.0	0.9999972758163257	99.99999999996426

Annotation Cluster 2	Enrichment Score: 0.13876944094888757
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
UP_KEYWORDS	Membrane	10	43.47826086956522	0.3432466422296966	ENSG00000137502, ENSG00000102743, ENSG00000050438, ENSG00000099617, ENSG00000128283, ENSG00000149541, ENSG00000162188, ENSG00000197818, ENSG00000185818, ENSG00000213380	22	7494	20581	1.2483319989324793	0.9999999999999983	0.9772688861660149	98.86760814991567
UP_SEQ_FEATURE	transmembrane region	5	21.73913043478261	0.8140386056324314	ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818	22	5056	20063	0.9018537830840045	1.0	0.9999999999999997	99.99999861987743
UP_KEYWORDS	Transmembrane helix	5	21.73913043478261	0.8670386049310165	ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818	22	5634	20581	0.8302271920482783	1.0	0.99999148614062	99.99999995415865
UP_KEYWORDS	Transmembrane	5	21.73913043478261	0.8687936796993484	ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818	22	5651	20581	0.8277296053795788	1.0	0.9999827433821622	99.99999996021165
GOTERM_CC_DIRECT	GO:0016021~integral component of membrane	4	17.391304347826086	0.9614936232995246	ENSG00000102743, ENSG00000149541, ENSG00000197818, ENSG00000185818	22	5163	18224	0.6417692321236773	1.0	0.9999999410477701	99.99999999999572

Annotation Cluster 3	Enrichment Score: 0.10814451882692515
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
GOTERM_CC_DIRECT	GO:0005576~extracellular region	3	13.043478260869565	0.5650202328331935	ENSG00000099617, ENSG00000143869, ENSG00000173401	22	1610	18224	1.5435347261434218	1.0	0.9995280289945352	99.96172821785414
UP_SEQ_FEATURE	signal peptide	3	13.043478260869565	0.8873328320761085	ENSG00000099617, ENSG00000143869, ENSG00000173401	22	3346	20063	0.8176520132587077	1.0	0.9999999999999631	99.99999999371153
UP_KEYWORDS	Signal	3	13.043478260869565	0.9449653686711603	ENSG00000099617, ENSG00000143869, ENSG00000173401	22	4160	20581	0.674639423076923	1.0	0.9999978479008454	99.99999999999622

Annotation Cluster 4	Enrichment Score: 0.06705635298464693
Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
UP_KEYWORDS	Disulfide bond	4	17.391304347826086	0.7052904766215429	ENSG00000050438, ENSG00000099617, ENSG00000143869, ENSG00000149541	22	3434	20581	1.0896913220733837	1.0	0.9995057831808567	99.99977860872981
UP_SEQ_FEATURE	disulfide bond	3	13.043478260869565	0.8313843498748016	ENSG00000099617, ENSG00000143869, ENSG00000149541	22	2917	20063	0.9379032006731698	1.0	0.9999999999999281	99.99999951871355
UP_SEQ_FEATURE	glycosylation site:N-linked (GlcNAc...)	3	13.043478260869565	0.9544875159034373	ENSG00000099617, ENSG00000143869, ENSG00000149541	22	4234	20063	0.6461652424099282	1.0	1.0	99.99999999999963
UP_KEYWORDS	Glycoprotein	3	13.043478260869565	0.963461735118086	ENSG00000099617, ENSG00000143869, ENSG00000149541	22	4551	20581	0.616677653263019	1.0	0.9999984894709725	99.99999999999996

Compute the Spearman correlation of relative abundance between all pairs of bulk samples.

R = pd.Series([st.mstats.spearmanr(S[i], S[j]).correlation for i, j in itertools.combinations(S.columns, 2)])
R.describe()

Compute the Spearman correlation between all pairs of scRNA-Seq estimates.

R = pd.Series([st.mstats.spearmanr(T[i], T[j]).correlation for i, j in itertools.combinations(T.columns, 2)])
R.describe()

Compute the Spearman correlation between bulk and single cell.

pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()

Compute the Spearman correlation for randomized pairs of bulk/single cell abundances.

np.random.seed(0)
pd.Series([st.mstats.spearmanr(S[i], T[j]).correlation
           for i in np.random.choice(S.columns, 20, replace=True)
           for j in np.random.choice(T.columns, 20, replace=True)]).describe()

Plot bulk vs. single cell relative abundance

Under our assumed model, the parameter \(μ\) is proportional to relative abundance.

log_mu = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0)
logodds = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0)
 # Important: log(sigmoid(x)) = -softplus(-x)
log_mu -= np.log1p(np.exp(logodds))
log_mu -= log_mu.agg(sp.logsumexp, axis=0)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()]
        .mask(mask)
        .align(log_mu.mask(mask), join='inner'))

Plot the concordance.

plot_concordance_rho(
  S,
  T,
  '/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/vs-sc-mean')

file:figure/sc-vs-bulk.org/vs-sc-mean/NA18507.svg file:figure/sc-vs-bulk.org/vs-sc-mean/NA18522.svg

Look at the genes only detected in bulk.

with open('/scratch/midway2/aksarkar/singlecell/density-estimation/bulk-only.txt', 'w') as f:
  print(*set([x for k in T for x in pd.Series(T[k].mask(np.isfinite(S[k])).dropna().index)]), sep='\n', file=f)

Look at pathway enrichment for genes only detected in bulk.

curl "https://david.ncifcrf.gov/data/download/t2t_B5C5AE2A242A1525715859955.txt"

Compare ZINB estimate against pooled MLE.

S, T = log_mu.mask(mask).align(sc_log_rho.mask(mask), join='inner')
del S['NA18498']
del T['NA18498']
diff = abs(S - T)
diff.describe()

pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()

plot_concordance_rho(
  S,
  T,
  xlabel='ZINB ln relative abundance',
  ylabel='Pooled ln relative abundance',
  output_dir='/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/zinb-vs-pooled')

Look at \((1 - π) μ\).

logodds = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/without-cell-cycle/zi2-logodds.txt.gz', sep=' ', index_col=0)
corrected_log_mu = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0)
corrected_log_mu *= sp.expit(logodds)
corrected_log_mu -= sp.logsumexp(corrected_log_mu)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()]
        .mask(mask)
        .align(corrected_log_mu.mask(mask), join='inner'))
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()

S, T = (sc_log_rho.loc[keep_genes.values.ravel()]
        .mask(mask)
        .align(corrected_log_mu.mask(mask), join='inner'))
del S["NA18507"]
del T["NA18507"]
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()

Plot bulk vs. pooled subsets

Plot concordance between bulk vs pools of single cells, focusing on genes which have log-transformed expression at least 1 in both assays.

plot_concordance_by_num_cells(
  'NA18507',
  umi,
  annotations,
  bulk_log_tpm,
  '/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/vs-cells/'
)

file:figure/sc-vs-bulk.org/vs-cells/NA18507-1.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-10.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-50.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-100.svg file:figure/sc-vs-bulk.org/vs-cells/NA18507-200.svg

Plot pooled subsets vs. pooled subsets

Ensure that pools don’t overlap by randomly sampling double the cells and partitioning into two halves.

plot_concordance_pooled_subsets(
  'NA18507',
  umi,
  annotations,
  '/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/subsets/'
)

file:figure/sc-vs-bulk.org/subsets/NA18507-1.svg file:figure/sc-vs-bulk.org/subsets/NA18507-10.svg file:figure/sc-vs-bulk.org/subsets/NA18507-50.svg file:figure/sc-vs-bulk.org/subsets/NA18507-100.svg

Chu et al hESC

Chu et al 2016 profiled hESC using single cell and matched bulk RNA-Seq (GSE75748). Analyze their data analagously to understand whether the correlation we observe is anomalous.

curl -sO --ftp-pasv ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75748/suppl/GSE75748_bulk_cell_type_ec.csv.gz
curl -sO --ftp-pasv ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75748/suppl/GSE75748_sc_cell_type_ec.csv.gz
chu_bulk_tpm = pd.read_table('/scratch/midway2/aksarkar/singlecell/hesc/GSE75748_bulk_cell_type_ec.csv.gz', sep=',', index_col=0)
T = np.log(chu_bulk_tpm) - np.log(chu_bulk_tpm.sum(axis=0))
R = pd.DataFrame([(i, j, st.mstats.spearmanr(T[i], T[j]).correlation) for i, j in it.combinations(sorted(T.columns), 2)])
M = R.pivot(index=0, columns=1, values=2).T
plt.clf()
plt.imshow(M, cmap=colorcet.cm['kr'])
cb = plt.colorbar()
cb.set_label('Spearman correlation')
plt.gca().set_aspect('equal')
plt.xticks(range(M.shape[0]), M.columns, rotation=90)
_ = plt.yticks(range(M.shape[1]), M.index)
chu_sc_tpm = pd.read_table('/scratch/midway2/aksarkar/singlecell/hesc/GSE75748_sc_cell_type_ec.csv.gz', sep=',', index_col=0)
for k in ('H1', 'H9', 'DEC', 'EC', 'HFF', 'NPC', 'TB'):
  bulk_rho = chu_bulk_tpm.filter(like=k, axis='columns').agg(np.mean, axis=1)
  bulk_rho = np.log(bulk_rho) - np.log(bulk_rho.sum(axis=0))
  sc_rho = chu_sc_tpm.filter(like=k, axis='columns').agg(np.mean, axis=1)
  sc_rho = np.log(sc_rho) - np.log(sc_rho.sum(axis=0))
  x = sc_rho.mask(mask).dropna().to_frame()
  y = bulk_rho.mask(mask).dropna().to_frame()
  plot_concordance(
    x=x,
    y=y,
    title=k,
    gridsize=20,
    filename='/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/hesc/{}.svg'.format(k),
    xlabel='Single cell ln relative abundance',
    ylabel='Bulk ln relative abundance',
  )