Skip to content

Commit

Permalink
update dev version
Browse files Browse the repository at this point in the history
Conflicts:
	hase.py
	hdgwas/converter.py
	tools/minimac2hdf5.py
  • Loading branch information
adsnps committed Nov 26, 2016
2 parents d920365 + 12b54b8 commit a665110
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 25 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,7 @@ And then you need to install (or first uninstall) `scipy` and `numpy` python lib
## Citation
If you use HASE framework, please cite:
[G.Roshchupkin et al. HASE: Framework for efficient high-dimensional association analyses.
BioRxiv, 2016.](http://dx.doi.org/10.1101/037382)
[Roshchupkin, G. V. et al. HASE: Framework for efficient high-dimensional association analyses. Sci. Rep. 6, 36076; doi: 10.1038/srep36076 (2016)](http://www.nature.com/articles/srep36076)
## Licence
This project is licensed under GNU GPL v3.
Expand All @@ -147,3 +146,10 @@ Gennady V. Roshchupkin (Department of Epidemiology, Radiology and Medical Inform
Hieab H. Adams (Department of Epidemiology, Erasmus MC, Rotterdam, Netherlands)
## Contacts
If you have any questions/suggestions/comments or problems do not hesitate to contact us!
* [email protected]
* [email protected]
2 changes: 1 addition & 1 deletion data/reference_file_link.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
https://www.dropbox.com/s/usa8c8wbdm1wff9/1000Gp1v3.ref.gz?dl=0
https://www.dropbox.com/s/ffw366q4d8shfkf/1000Gp1v3.ref.gz?dl=0
https://www.dropbox.com/s/ajq6ngv1090splo/1000Gp1v3.ref_info.h5?dl=0
29 changes: 15 additions & 14 deletions hase.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@
###

#FLAGS
parser.add_argument('-hdf5', type=bool,default=True, help='flag for genotype data format')
parser.add_argument('-id', type=bool, default=False, help='Flag to convert minimac data to genotype per subject files first (default False)')
parser.add_argument('-pd_full', type=bool, default=False, help='For not HD association study')
parser.add_argument('-effect_intercept', type=bool, default=False, help='Flag for add study effect to PD regression model')
parser.add_argument('-permute_ph', type=bool, default=False, help='Flag for phenotype permutation')
parser.add_argument('-hdf5', action='store_true',default=True, help='flag for genotype data format')
parser.add_argument('-id', action='store_true', default=False, help='Flag to convert minimac data to genotype per subject files first (default False)')
parser.add_argument('-pd_full', action='store_true', default=False, help='For not HD association study')
parser.add_argument('-effect_intercept', action='store_true', default=False, help='Flag for add study effect to PD regression model')
parser.add_argument('-permute_ph', action='store_true', default=False, help='Flag for phenotype permutation')
#TODO (low) save genotype after MAF
###

Expand All @@ -120,14 +120,14 @@
MAPPER_CHUNK_SIZE=args.mapper_chunk
ARG_CHECKER=Checker()
print args
ARG_CHECKER.check(args,mode=args.mode)
os.environ['HASEOUT']=args.out


################################### CONVERTING ##############################

if args.mode=='converting':


#ARG_CHECKER.check(args,mode='converting')

R=Reader('genotype')
R.start(args.genotype[0])
Expand All @@ -152,7 +152,7 @@

elif args.mode=='encoding':


#ARG_CHECKER.check(args,mode='encoding')
mapper=Mapper()
mapper.genotype_names=args.study_name
mapper.chunk_size=MAPPER_CHUNK_SIZE
Expand Down Expand Up @@ -188,7 +188,7 @@
genotype=np.apply_along_axis(lambda x: flip*(x-2*flip_index) ,0,genotype)
genotype=genotype[:,row_index[0]]
encode_genotype=e.encode(genotype, data_type='genotype')
e.save_hdf5(encode_genotype,save_path=os.path.join(args.out, 'encode_genotype' ) )
e.save_hdf5(encode_genotype,os.path.join(args.out, 'encode_genotype' ) )
encode_genotype=None
gc.collect()

Expand Down Expand Up @@ -216,7 +216,7 @@

elif args.mode=='single-meta':


#ARG_CHECKER.check(args,mode='single-meta')
mapper=Mapper()
mapper.genotype_names=args.study_name
mapper.chunk_size=MAPPER_CHUNK_SIZE
Expand Down Expand Up @@ -247,7 +247,7 @@

elif args.mode=='meta-stage':


#ARG_CHECKER.check(args,mode='meta-stage')

##### Init data readers #####
if args.derivatives is None:
Expand Down Expand Up @@ -299,7 +299,8 @@
gen.append(Reader('genotype'))
gen[i].start(j,hdf5=args.hdf5, study_name=args.study_name[i], ID=False)


#for i in gen:
# i._data.link()
row_index, ids = study_indexes(phenotype=tuple(i.folder._data for i in phen),genotype=tuple(i.folder._data for i in gen),covariates=tuple(i.folder._data.metadata for i in pard))

while True:
Expand Down Expand Up @@ -420,7 +421,7 @@

elif args.mode=='regression':


#ARG_CHECKER.check(args,mode='regression')

print ('START regression mode...')
if args.mapper is None:
Expand Down Expand Up @@ -480,7 +481,7 @@
raise ValueError('You can not exclude or include variants to analysis without mapper!')
else:
raise ValueError('You can not run regression analysis with several genotype data without mapper!')

#mapper=None

Analyser=HaseAnalyser()
Analyser.threshold=args.thr
Expand Down
1 change: 1 addition & 0 deletions hdgwas/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def convert_probes(self, chunk_size=100000):
#@profile
def convert_genotypes(self):


chunk_size=self.split_size
if chunk_size is None:
raise ValueError('CONVERTER_SPLIT_SIZE does not define in config file!')
Expand Down
2 changes: 2 additions & 0 deletions hdgwas/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,9 +274,11 @@ def __init__(self, data_path , name, type='PLINK'):
self.name=name
self.id=np.array(pd.read_hdf(os.path.join(data_path,'individuals',self.name+'.h5'),'individuals').individual.tolist())
if "MAPPER" in os.environ:
print ('Reading ID from probes....')
self.names=pd.read_hdf(os.path.join(data_path,'probes', self.name +'.h5'),'probes', where='columns=[ID]').ID
self.shape=(len(self.id), len(self.names))
else:
print ('Use ID from mapper')
self.names=pd.HDFStore(os.path.join(data_path,'probes', self.name +'.h5'),'r')
self.shape=(len(self.id), self.names.get_storer('probes').nrows)
if type=='PLINK':
Expand Down
19 changes: 14 additions & 5 deletions hdgwas/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(self):
self.permutation=False
self.rsid_dic={}
self.result_folder=None
self.result_dump_size=100
self.result_dump_size=3


def summary(self):
Expand All @@ -81,13 +81,14 @@ def summary(self):
raise ValueError('Please set DF to Analyser!')

if self.result_folder is None:
self.result_folder=os.listdir(self.result_path)
self.result_folder=glob.glob( os.path.join(self.result_path, '*.npy') )

self.results['RSID']=np.array([])
self.results['p_value']=np.array([])
self.results['t-stat']=np.array([])
self.results['phenotype']=np.array([])
self.results['SE']=np.array([])
self.results['MAF']=np.array([])

files=[]
for i in range(self.result_dump_size):
Expand All @@ -98,12 +99,13 @@ def summary(self):
if len(files)!=0:
for i in files:
print i
d=np.load(os.path.join(self.result_path, i)).item()
d=np.load(i).item()
p_value=stats.t.sf(np.abs(d['t-stat']),self.DF)*2
self.results['t-stat']=np.append(self.results['t-stat'],d['t-stat'].flatten())
self.results['SE']=np.append(self.results['SE'],d['SE'].flatten())
self.results['RSID']=np.append(self.results['RSID'],d['index'])
self.results['phenotype']=np.append(self.results['phenotype'],d['phenotype'])
self.results['MAF']=np.append(self.results['MAF'],d['MAF'])
self.results['p_value']=np.append(self.results['p_value'],p_value.flatten())
else:
self.results=None
Expand All @@ -121,9 +123,10 @@ def save_result(self , phen_names):
mask = np.where(np.abs(self.t_stat) > 0)

if (len(mask[0]) != 0):
print ('Saving results to {}'.format(save_path))
t_save = self.t_stat[mask[0],mask[1],mask[2]]
se=self.SE[mask[0],mask[1],mask[2]]
result = {'phenotype': phen_names[mask[2]], 't-stat': t_save,'index':self.rsid[mask[0]],'SE':se}
result = {'phenotype': phen_names[mask[2]], 't-stat': t_save,'index':self.rsid[mask[0]],'SE':se, 'MAF':self.MAF[mask[0]]}

if not self.cluster:
np.save(os.path.join(save_path, str(self.result_index) + 'result.npy'), result)
Expand Down Expand Up @@ -503,7 +506,8 @@ def get(self,chunk_number=None):
self.processed=finish
self.include_ind=np.setxor1d(self.include_ind,self.exclude_ind)
if len(self.include_ind)>0:
ind=np.intersect1d(np.arange(start,finish),self.include_ind)
ind=self.include_ind
self.processed=self.n_keys #TODO (low) dirty hack
else:
ind=np.arange(start,finish)

Expand All @@ -518,6 +522,10 @@ def get(self,chunk_number=None):
if self.processed==self.n_keys:
return None, None

if self.include is not None and len(self.include_ind)==0:
print ('None of included ID found in genotype data!')
return None,None

ind=ind.astype('int')
indexes=self.values[ind,:]
keys=self.keys[ind]
Expand Down Expand Up @@ -669,6 +677,7 @@ def _get_id(notype):
index_c=np.array([np.where(id_c==i)[0][0] for i in common_id])

print ('There are {} common ids'.format(len(common_id)))
np.savetxt(os.path.join(os.environ['HASEOUT'],'study_common_id.txt'),common_id, fmt='%s')
if len(common_id)==0:
exit(0)

Expand Down
5 changes: 4 additions & 1 deletion tools/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import h5py
import tables
from hdgwas.tools import Timer,HaseAnalyser, Reference
import argparse
import pandas as pd
Expand All @@ -17,7 +19,7 @@
#TODO (low) add reference panel
args = parser.parse_args()
Analyser=HaseAnalyser()

print args

Analyser.DF=np.float(args.df)
Analyser.result_path=args.r
Expand All @@ -28,6 +30,7 @@
results['t-stat']=np.array([])
results['phenotype']=np.array([])
results['SE']=np.array([])
results['MAF']=np.array([])

while True:
Analyser.summary()
Expand Down
3 changes: 2 additions & 1 deletion tools/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
a = probes.select('probes', start = p_start_i, stop = p_stop_i)

if p==0:
print a.head()
if issubclass(type(a.iloc[0]['allele1']), np.str):
hashing=True
if "CHR" in a.columns and 'bp' in a.columns:
Expand Down Expand Up @@ -104,7 +105,7 @@
def f(x):
s=x.ID.split(':')
return s[0],s[1]
CHR_bp=a.apply(f, s[0],s[1], axis=1 )
CHR_bp=a.apply(f, axis=1 )
a['CHR'],a['bp']=zip(*CHR_bp)

a['counter_prob']=np.arange(p_start_i,p_stop_i,dtype='int32')
Expand Down
1 change: 0 additions & 1 deletion tools/minimac2hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import glob



def probes_minimac2hdf5(data_path, save_path,study_name, chunk_size=1000000):

if os.path.isfile(os.path.join(save_path,'probes',study_name+'.h5')):
Expand Down

0 comments on commit a665110

Please sign in to comment.