forked from popsim-consortium/stdpopsim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotation_maint.py
38 lines (37 loc) · 1.31 KB
/
annotation_maint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import allel
import zarr
import numpy as np
import stdpopsim as stp
import logging
import warnings
import urllib.request
import os
logger = logging.getLogger(__name__)

# Root directory that receives one zipped zarr store per species.
annot_path = "annotations"
# exist_ok=True lets the script be re-run without failing on an
# already-existing output directory.
os.makedirs(annot_path, exist_ok=True)

# For every species that declares annotations, download the GFF of its first
# annotation and store each column of the parsed table as a zarr array.
for spc in stp.all_species():
    if not spc.annotations:
        continue

    address = spc.annotations[0].url
    # The second dot-separated field of the URL's file name is used as the
    # genome-version label in the output file name.
    genome_version = os.path.basename(address).split(".")[1]
    tmp_path = f"{spc.id}.tmp.gff.gz"

    logger.info("Downloading GFF file %s", spc.id)
    try:
        urllib.request.urlretrieve(address, tmp_path)
    except OSError as err:
        # URLError, HTTPError and ContentTooShortError are all OSError
        # subclasses (as is FileNotFoundError), so this covers network
        # failures as well as local write problems.
        warnings.warn(f"can't connect to url {address}: {err}")
        # A failed transfer may leave a partial file behind.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        # Nothing to convert for this species; move on to the next one.
        continue

    logger.info("creating zarr arrays %s", spc.id)
    spc_path = os.path.join(annot_path, spc.id + "." + genome_version + ".zip")
    try:
        # Parse first so that a malformed GFF does not leave an empty
        # zip archive in the output directory.
        annot = allel.gff3_to_dataframe(tmp_path)
        # create zarr store and zarr root
        store = zarr.ZipStore(spc_path)
        try:
            root = zarr.group(store=store, overwrite=True)
            for col_name in annot.columns:
                column = annot[col_name]
                if column.dtype == "O":
                    # Object columns are stored as fixed-width strings.
                    root.array(col_name, np.array(column, dtype=str))
                else:
                    root.array(col_name, np.array(column))
        finally:
            # Always close the store, even if writing a column fails.
            store.close()
    finally:
        # cleanup: the downloaded GFF is only needed during conversion.
        os.unlink(tmp_path)