Skip to content

Commit

Permalink
Take more samples to compute randomized quantiles
Browse files Browse the repository at this point in the history
  • Loading branch information
aksarkar committed Dec 2, 2018
1 parent 16d9e02 commit de4ff6a
Showing 1 changed file with 163 additions and 83 deletions.
246 changes: 163 additions & 83 deletions analysis/zinb.org
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,12 @@
#+END_SRC

#+RESULTS:
: 19
: 3

#+CALL: ipython3(memory="16G",venv="scqtl") :dir /scratch/midway2/aksarkar/singlecell

#+RESULTS:
: Submitted batch job 53595685
: Submitted batch job 54477665

#+NAME: zinb-imports
#+BEGIN_SRC ipython
Expand All @@ -87,7 +87,7 @@

#+RESULTS: zinb-imports
:RESULTS:
# Out[1]:
# Out[2]:
:END:

#+BEGIN_SRC ipython
Expand All @@ -100,7 +100,7 @@

#+RESULTS:
:RESULTS:
# Out[2]:
# Out[3]:
:END:

#+NAME: tf-imports
Expand Down Expand Up @@ -359,6 +359,7 @@

** Simulation

#+NAME: tf-sim.py
#+BEGIN_SRC ipython :eval never :noweb tangle :tangle /project2/mstephens/aksarkar/projects/singlecell-qtl/code/tf-sim.py
<<zinb-imports>>
<<tf-imports>>
Expand Down Expand Up @@ -387,7 +388,7 @@
design=design.astype(np.float32),
size_factor=(num_mols * np.ones((num_samples * num_trials, 1))).astype(np.float32),
learning_rate=5e-2,
max_epochs=8000)
max_epochs=4000)
result = pd.DataFrame(
[(a[0] // num_trials, int(a[1]), int(a[2]), int(a[3]), int(a[4]), a[-1], trial)
for a in args
Expand All @@ -404,24 +405,26 @@
result['var_log_cpm'] = log_cpm.var(axis=1).ravel(order='F')

diagnostic = []
for k, x in umi.iterrows():
diagnostic.append(diagnostic_test(x, log_mu.loc[k], log_phi.loc[k], logodds.loc[k], annotations['mol_hs'], onehot))
for i in range(umi.shape[1]):
for j in range(onehot.shape[1]):
idx = onehot[:,j].astype(bool)
diagnostic.append(diagnostic_test(umi[idx,i], log_mu[j,i], log_phi[j,i], -logodds[j,i], num_mols, np.ones((num_samples, 1)))[1])
result['diagnostic'] = diagnostic
return result

res = evaluate(num_samples=95, num_mols=114026)
res = evaluate(num_samples=95, num_mols=114026, num_trials=50)
res.to_csv('simulation.txt.gz', compression='gzip', sep='\t')
#+END_SRC

#+BEGIN_SRC sh :dir /scratch/midway2/aksarkar/singlecell/density-estimation/
sbatch --partition=gpu2 --gres=gpu:1 --mem=16G --time=240 --job-name=tf-sim --output=sim.out
sbatch --partition=gpu --gres=gpu:1 --mem=16G --time=30 --job-name=tf-sim --output=sim.out
#!/bin/bash
source activate scqtl
python /project2/mstephens/aksarkar/projects/singlecell-qtl/code/tf-sim.py
#+END_SRC

#+RESULTS:
: Submitted batch job 53610363
: Submitted batch job 54073346

Read the results.

Expand Down Expand Up @@ -711,7 +714,7 @@

Optimize the negative log-likelihood.

#+NAME: np-zinb-impl2
#+NAME: np-zinb-impl
#+BEGIN_SRC ipython
def log(x):
"""Numerically safe log"""
Expand Down Expand Up @@ -802,9 +805,9 @@
return orig + list(se.ravel())
#+END_SRC

#+RESULTS: np-zinb-impl2
#+RESULTS: np-zinb-impl
:RESULTS:
# Out[26]:
# Out[5]:
:END:

Computing analytic SE runs into numerical problems.
Expand Down Expand Up @@ -895,9 +898,9 @@
return np.vstack((x, size * np.ones(num_samples))).T, design
#+END_SRC

#+RESULTS: sim-impl2
#+RESULTS: sim-impl
:RESULTS:
# Out[31]:
# Out[4]:
:END:

#+NAME: numpy-eval-impl
Expand Down Expand Up @@ -2061,7 +2064,7 @@

#+RESULTS:
:RESULTS:
# Out[50]:
# Out[3]:
#+BEGIN_EXAMPLE
pi0 trial ks sw
0 NaN 0 0.030844 0.145921
Expand Down Expand Up @@ -2097,129 +2100,206 @@
#+END_EXAMPLE
:END:

Read the estimated parameters.
Implement the diagnostic test.

#+NAME: zinb-diagnostic
#+BEGIN_SRC ipython
log_mu = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-log-mu.txt.gz", index_col=0, sep=' ')
log_phi = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-log-phi.txt.gz", index_col=0, sep=' ')
logodds = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-logodds.txt.gz", index_col=0, sep=' ')
def rpp(x, log_mu, log_phi, logodds, size, onehot, n_samples=1):
n = onehot.dot(np.exp(-log_phi))
pi0 = onehot.dot(sp.expit(-logodds))
p = 1 / (1 + (size * onehot.dot(np.exp(log_mu + log_phi))))

cdf = st.nbinom(n=n, p=p).cdf(x - 1)
# Important: this excludes the right endpoint, so we need to special case x =
# 0
cdf = np.where(x > 0, pi0 + (1 - pi0) * cdf, cdf)
pmf = st.nbinom(n=n, p=p).pmf(x)
pmf *= (1 - pi0)
pmf[x == 0] += pi0[x == 0]
rpp = cdf + np.random.uniform(size=(n_samples, x.shape[0])) * pmf
return rpp

def diagnostic_test(x, log_mu, log_phi, logodds, size, onehot):
rpp = rpp(x, log_mu, log_phi, logodds, size, onehot)
return st.kstest(rpp, 'uniform')
#+END_SRC

#+RESULTS:
#+RESULTS: zinb-diagnostic
:RESULTS:
# Out[51]:
# Out[24]:
:END:

Read the annotations.
Look at randomized residuals for some simulated data. Use the CPU
implementation for convenience.

#+BEGIN_SRC ipython
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
annotations = annotations.loc[keep_samples.values.ravel()]
#+END_SRC
#+CALL: np-zinb-impl()

#+RESULTS:
:RESULTS:
# Out[52]:
# Out[50]:
:END:

#+CALL: recode-impl()
#+CALL: sim-impl()

#+RESULTS:
:RESULTS:
# Out[53]:
# Out[51]:
:END:

#+BEGIN_SRC ipython
onehot = recode(annotations, 'chip_id')
#+BEGIN_SRC ipython :ipyfile figure/zinb.org/simulated-rqrs.png
X, design = simulate(num_samples=100, size=1e5, log_mu=-13, log_phi=-3, logodds=0, seed=0, design=None, fold=None)
res = _fit_gene(X, np.ones((X.shape[0], 1)), design)
Z = rpp(X[:,0], res[0], res[1], -res[2], 1e5, np.ones((X.shape[0], 1)), n_samples=100)

plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(sorted(Z.ravel()), np.linspace(0, 1, np.prod(Z.shape)), c='k')
plt.text(.05, .92, 'ks = {:.2g}'.format(st.kstest(Z.ravel(), 'uniform')[1]), transform=plt.gca().transAxes)
plt.text(.05, .82, r'$\ln\phi = -3$', transform=plt.gca().transAxes)
plt.text(.05, .72, r'$\ln\hat\phi = {:.2g}$'.format(res[1][0]), transform=plt.gca().transAxes)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Observed quantiles')
plt.plot([0, 1], [0, 1], c='r', ls='--')
#+END_SRC

#+RESULTS:
:RESULTS:
# Out[54]:
# Out[150]:
: [<matplotlib.lines.Line2D at 0x7fd7a8e0b358>]
[[file:figure/zinb.org/simulated-rqrs.png]]
:END:

#+BEGIN_SRC ipython :ipyfile figure/zinb.org/simulated-nrpps.png
Z = st.norm().ppf(Z)
plt.clf()
plt.gcf().set_size_inches(3, 3)
M = int(X[:,0].max())
for v in range(M):
if not (X == v).any():
continue
vp = plt.violinplot(Z[:,X[:,0] == v].ravel(), [v])
for b in vp['bodies']:
b.set_facecolor('k')
vp['cbars'].set_color('k')
vp['cmins'].set_color('k')
vp['cmaxes'].set_color('k')
plt.xlabel('UMI count')
plt.ylabel('Randomized residual')

#+END_SRC

Run the diagnostic on the estimated parameters.

#+BEGIN_SRC ipython :eval never :noweb tangle :tangle /project2/mstephens/aksarkar/projects/singlecell-qtl/code/tf-diagnostic.py
<<zinb-imports>>
<<recode-impl>>
<<zinb-diagnostic>>

log_mu = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-log-mu.txt.gz", index_col=0, sep=' ')
log_phi = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-log-phi.txt.gz", index_col=0, sep=' ')
logodds = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-logodds.txt.gz", index_col=0, sep=' ')

keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
annotations = annotations.loc[keep_samples.values.ravel()]

onehot = recode(annotations, 'chip_id')

result = []
for chunk in pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0, chunksize=100):
chunk = chunk.loc[:,keep_samples.values.ravel()].filter(items=log_mu.index, axis='index')
if chunk.empty:
continue
for k, x in chunk.iterrows():
stat, p = diagnostic_test(x, log_mu.loc[k], log_phi.loc[k], logodds.loc[k], annotations['mol_hs'], onehot)
result.append((k, stat, p))
pd.DataFrame(result).set_index(0).to_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/diagnostic.txt.gz', compression='gzip', sep='\t')
#+END_SRC

#+BEGIN_SRC sh :dir /scratch/midway2/aksarkar/singlecell/density-estimation
sbatch --partition=broadwl --job-name="tf-diagnostic" --mem=8G --time=8:00:00
#!/bin/bash
source activate scqtl
python /project2/mstephens/aksarkar/projects/singlecell-qtl/code/tf-diagnostic.py
#+END_SRC

#+RESULTS:
: Submitted batch job 54080417

Look at randomized quantile residuals for the gene with least departure from
ZINB.

#+BEGIN_SRC ipython
chip = recode(annotations, 'experiment')
chip -= chip.mean(axis=0)
diagnostic_results = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/diagnostic.txt.gz', index_col=0)
#+END_SRC

#+RESULTS:
:RESULTS:
# Out[55]:
# Out[5]:
:END:

Implement the diagnostic test.
#+CALL: recode-impl()

#+RESULTS:
:RESULTS:
# Out[9]:
:END:

#+NAME: zinb-diagnostic
#+BEGIN_SRC ipython
def diagnostic_test(x, log_mu, log_phi, logodds, size, onehot):
n = onehot.dot(np.exp(-log_phi))
pi0 = onehot.dot(sp.expit(-logodds))
p = 1 / (1 + (annotations['mol_hs'] * onehot.dot(np.exp(log_mu + log_phi))))
log_mu = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-log-mu.txt.gz", index_col=0, sep=' ')
log_phi = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-log-phi.txt.gz", index_col=0, sep=' ')
logodds = pd.read_table("/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design0/zi2-logodds.txt.gz", index_col=0, sep=' ')

cdf = st.nbinom(n=n, p=p).cdf(x - 1)
# Important: this excludes the right endpoint, so we need to special case x =
# 0
cdf = np.where(x > 0, pi0 + (1 - pi0) * cdf, cdf)
pmf = st.nbinom(n=n, p=p).pmf(x)
pmf *= (1 - pi0)
pmf[x == 0] += pi0[x == 0]
rpp = cdf + np.random.uniform(size=x.shape[0]) * pmf
nrpp = st.norm().ppf(rpp)
return st.shapiro(nrpp)
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
annotations = annotations.loc[keep_samples.values.ravel()]

onehot = recode(annotations, 'chip_id')
#+END_SRC

#+RESULTS:
:RESULTS:
# Out[60]:
# Out[10]:
:END:

#+BEGIN_SRC ipython
keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None)

result = []
k = diagnostic_results['2'].idxmax()
for chunk in pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0, chunksize=100):
chunk = chunk.loc[:,keep_samples.values.ravel()].filter(items=log_mu.index, axis='index')
if chunk.empty:
continue
for k, x in chunk.iterrows():
result.append(diagnostic_test(x, log_mu.loc[k], log_phi.loc[k], logodds.loc[k], annotations['mol_hs'], onehot))
break
chunk = chunk.loc[:,keep_samples.values.ravel()].filter(items=[k], axis='index')
if not chunk.empty:
break
#+END_SRC

#+RESULTS:
:RESULTS:
# Out[61]:
# Out[18]:
:END:

Try and generate new data from the estimated parameters, and compare it to
the real data.
#+CALL: get-gene-info()

#+BEGIN_SRC ipython :ipyfile figure/zinb.org/ppc.png
def sample(log_mu, log_phi, logodds, size, onehot):
n = onehot.dot(np.exp(-log_phi))
p = 1 / (1 + (annotations['mol_hs'] * onehot.dot(np.exp(log_mu + log_phi))))
res = st.nbinom(n=n, p=p).rvs(size=onehot.shape[0])
pi0 = onehot.dot(sp.expit(logodds))
y = np.random.uniform(size=onehot.shape[0]) < pi0
res[y] = 0
return res
#+RESULTS:
:RESULTS:
# Out[126]:
:END:

k = 'ENSG00000005486'
Look at the QQ plot of the randomized residuals.

#+BEGIN_SRC ipython :ipyfile figure/zinb.org/rqrs.png
Z = rpp(chunk.loc[k], log_mu.loc[k], log_phi.loc[k], logodds.loc[k], annotations['mol_hs'], onehot, n_samples=10)
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.hist(chunk.loc[k], histtype='step', lw=3, color='black', bins=chunk.loc[k].max())
for _ in range(5):
plt.hist(sample(log_mu.loc[k], log_phi.loc[k], logodds.loc[k], annotations['mol_hs'], onehot), histtype='step', bins=chunk.loc[k].max())
plt.xlabel('UMI count')
_ = plt.ylabel('Number of cells')
plt.plot(sorted(Z.ravel()), np.linspace(0, 1, np.prod(Z.shape)), c='k')
plt.plot([0, 1], [0, 1], c='r', ls='--')
plt.text(.05, .92, 'ks = {:.2g}'.format(st.kstest(Z.ravel(), 'uniform')[1]), transform=plt.gca().transAxes)
plt.title(gene_info.loc[k, 'name'])
plt.xlabel('Theoretical quantiles')
plt.ylabel('Observed quantiles')
#+END_SRC

#+RESULTS:
:RESULTS:
# Out[69]:
[[file:figure/zinb.org/ppc.png]]
# Out[131]:
: Text(0,0.5,'Observed quantiles')
[[file:figure/zinb.org/rqrs.png]]
:END:

0 comments on commit de4ff6a

Please sign in to comment.