-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrunBLADE.py
154 lines (118 loc) · 5.24 KB
/
runBLADE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
# coding: utf-8
# In[4]:
import sys, os
from Deconvolution.BLADE import Framework
import numpy as np
from numpy import transpose as t
import itertools
import pickle
from scipy.optimize import nnls
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.metrics import mean_squared_error as mse
import pandas as pd
# modules for visualization
import qgrid
from matplotlib import pyplot as plt
import seaborn as sns
import logging
# logging
#LOG = "/home/cke/runBLADE.log"
#logging.basicConfig(filename=LOG, filemode="w", level=logging.DEBUG,format='%(asctime)s %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
# console handler
#console = logging.StreamHandler()
#console.setLevel(logging.ERROR)
#logging.getLogger("").addHandler(console)
# ### Run BLADE with TCGA bulk and Puram scRNA-seq reference
#
# #### Application of deconvolution methods
#
# From here, we will apply the following three methods for further performance comparison:
# 1. BLADE (estimation of cellular fraction + group-mode/high-resolution-mode purification)
# 2. NNLS (estimation of fraction)
# 3. SVR followed by NNLS (estimation of fraction + group-mode purification) - similar to CIBERSORTx
#
#
# ##### 1. Application of BLADE
#
# These are the key parameters used in BLADE (note that there is default settings of these parameters, if not specified):
# - Hyperparameters (`hyperpars`): `Alpha`, `Alpha0`, `Kappa0` and `SigmaY`, each of which can be defined as a list of options. BLADE takes an empirical Bayes approach to find the optimal parameter set given the all possible combinations.
# - `Nrep`: Number of repeat for evaluating each parameter configuration.
# - `Nrepfinal`: Number of repeated optimizations for the final parameter set.
# - `Njob`: Number of parallel jobs.
# In[17]:
hyperpars = {
'Alpha': [1, 10],
'Alpha0': [0.1, 0.5, 1, 5, 10],
'Kappa0': [1, 0.5, 0.1],
'SY': np.sqrt([0.1, 0.5, 1, 1.5, 2])
}
Nrep=3
Nrepfinal=10
Njob=20
# In[10]:
# read in marker genes
marker_genes = pd.read_csv("/home/cke/Puram/top100markers_de_cor.txt",header=None).iloc[0,:]
# df_Puram_std = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_std.tsv",sep='\t',index_col=0)
# df_Puram_mean = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_mean.tsv",sep='\t',index_col=0)
# merged all tumor cell types
df_Puram_std = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_std_simple.tsv",sep='\t',index_col=0)
df_Puram_mean = pd.read_csv("/home/cke/Puram/HNSCC2PuramGSE103322_HNSCC_exp_mean_simple.tsv",sep='\t',index_col=0)
df_Puram_std_filtered = df_Puram_std.loc[marker_genes,:]
df_Puram_mean_filtered = df_Puram_mean.loc[marker_genes,:]
df_TCGA = pd.read_csv("/home/cke/TCGA-HNSC.htseq_counts_exp2.tsv",sep='\t',index_col=0)
df_Puram_mean_log2 = np.log2(df_Puram_mean_filtered+1)
merge_genes_mean = pd.merge(df_Puram_mean_log2,df_TCGA,left_index=True,right_index=True,how='inner')
merge_genes_std = pd.merge(df_Puram_std_filtered,df_TCGA,left_index=True,right_index=True,how='inner')
print("Get mean and std exp!")
# 21706 genes in common
# df_TCGA_shared = merge_genes_mean.iloc[:,24:]
# df_shared_mean = merge_genes_mean.iloc[:,:24]
# df_shared_std = merge_genes_std.iloc[:,:24]
#simple tumor cell type setup
df_TCGA_shared = merge_genes_mean.iloc[:,10:]
df_shared_mean = merge_genes_mean.iloc[:,:10]
df_shared_std = merge_genes_std.iloc[:,:10]
print("Get common genes! ",df_shared_mean.shape[0])
print("cell types: ",df_shared_mean.shape[1])
print("bulk samples: ",df_TCGA_shared.shape[1])
# Given the configuration above, BLADE is applied to each of the simulation dataset created previously.
#
# BLADE produce several outcomes:
# - `final_obj`: final BLADE object with optimized variational parameters
# - `best_obj`: BLADE object trained with the best parameter set found by the Empirical Bayes framework. Empirical Bayes framework is applied after selecting a subset of samples (5 samples; indicated by `Ind_sample` below), and thus the outcome contains only 5 samples. If `Nsample` <= 5, `final_obj` is identical to `best_obj`.
# - `best_set`: Best parameter set defined by Empirical Bayes framework.
# - `outs`: Outcome of BLADE for every possible combination of hyperparameters, used in the Empirical Bayes framework.
#
# - There are nan in mean and std matrix! NAs are filled with 0?
#
# full tumor type setup:
# - ngenes = 21706 common genes
# - ncells = 24, including all 16 tumor types
# - nsample = 546
#
# simple tumor type setup:
#- ngenes = 21706 common genes
#- ncells = 10, all tumor types are merged, including one NA type?
#- nsample = 546
# - marker genes = 900 (including 9 genes not shared)
print("start BLADE!")
Y = df_TCGA_shared.to_numpy()
mean = df_shared_mean.to_numpy()
sd = df_shared_std.to_numpy()
outfile = './BLADE/data/Puramfiltered_TCGA_corDEmarkers_BLADEout.pickle'
final_obj, best_obj, best_set, outs = Framework(
mean, sd, Y,
Alphas=hyperpars['Alpha'], Alpha0s=hyperpars['Alpha0'],
Kappa0s=hyperpars['Kappa0'], SYs=hyperpars['SY'],
Nrep=Nrep, Njob=Njob, Nrepfinal=Nrepfinal)
pickle.dump(
{
'final_obj': final_obj,
'best_obj': best_obj,
'best_set': best_set,
'outs' : outs
}, open(outfile, 'wb')
)
# In[ ]: