forked from DeepGraphLearning/torchdrug
Commit
Showing 18 changed files with 1,298 additions and 4 deletions.
@@ -0,0 +1,150 @@
import os
import glob

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.AlphaFoldDB")
@utils.copy_args(data.ProteinDataset.load_pdbs, ignore=("filtered_pdb",))
class AlphaFoldDB(data.ProteinDataset):
    """
    3D protein structures predicted by AlphaFold.
    This dataset covers the proteomes of 48 organisms, as well as the majority of Swiss-Prot.

    Statistics:
        See https://alphafold.ebi.ac.uk/download

    Parameters:
        path (str): path to store the dataset
        species_id (int, optional): the id of the species to load. The species are numbered
            in the order they appear on https://alphafold.ebi.ac.uk/download (0-20 for model
            organism proteomes, 21 for Swiss-Prot)
        split_id (int, optional): the id of the split to load. To avoid large memory consumption
            for one dataset, each species is cut into several splits, each of which contains
            at most 22,000 proteins.
        verbose (int, optional): output verbose level
        **kwargs
    """

    urls = [
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006548_3702_ARATH_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001940_6239_CAEEL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000559_237561_CANAL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000437_7955_DANRE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002195_44689_DICDI_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000803_7227_DROME_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000625_83333_ECOLI_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008827_3847_SOYBN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000005640_9606_HUMAN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000805_243232_METJA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000589_10090_MOUSE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000059680_39947_ORYSJ_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002494_10116_RAT_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002311_559292_YEAST_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002485_284812_SCHPO_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007305_4577_MAIZE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_pdb_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001631_447093_AJECG_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006672_6279_BRUMA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000799_192222_CAMJE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000094526_86049_9EURO1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000274756_318479_DRAME_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000325664_1352_ENTFC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000053029_1442368_9EURO2_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000579_71421_HAEIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000429_85962_HELPY_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007841_1125630_KLEPH_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000078237_100816_9PEZI1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000806_272631_MYCLE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000020681_1299332_MYCUL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000535_242231_NEIG1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006304_1133849_9NOCA1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000024404_6282_ONCVO_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002059_502779_PARBA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002438_208964_PSEAE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001014_99287_SALTY_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008854_6183_SCHMA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002716_300267_SHIDS_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000018087_1391915_SPOS1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000586_171101_STRR6_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000035681_6248_STRER_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000030665_36087_TRITR_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008524_185431_TRYB2_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000270924_6293_WUCBA_v2.tar"
    ]
    md5s = [
        "82b14d14404e39793cf73c1e0f083865", "9e26602ba2d9f233ef4fcf82703ddb59",
        "60a09db1e1c47a98763d09879784f536", "a0ab562b7372f149673c4518f949501f",
        "6205138b14fb7e7ec09b366e3e4f294b", "31f31359cd7254f82304e3886440bdd3",
        "a590096e65461ed4eb092b2147b97f0b", "8f1e120f372995644a7101ad58e5b2ae",
        "9a659c4aed2a8b833478dcd5fffc5fd8", "95d775f2ae271cf50a101c73335cd250",
        "e5b12da43f5bd77298ca50e19706bdeb", "90e953abba9c8fe202e0adf825c0dfcc",
        "38a11553c7e2d00482281e74f7daf321", "2bcdfe2c37154a355fe4e8150c279c13",
        "580a55e56a44fed935f0101c37a8c4ab", "b8d08a9033d111429fadb4e25820f9f7",
        "59d1167f414a86cbccfb204791fea0eb", "dfde6b44026f19a88f1abc8ac2798ce6",
        "a1c2047a16130d61cac4db23b2f5b560", "e4d4b72df8d075aeb607dcb095210304",
        "5cdad48c799ffd723636cae26433f1f9", "98a7c13987f578277bfb66ac48a1e242",
    ]
    species_nsplit = [
        2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 20,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    split_length = 22000

    def __init__(self, path, species_id=0, split_id=0, verbose=1, **kwargs):
        path = os.path.expanduser(path)
        if not os.path.exists(path):
            os.makedirs(path)
        self.path = path

        species_name = os.path.basename(self.urls[species_id])[:-4]
        if split_id >= self.species_nsplit[species_id]:
            raise ValueError("Split id %d should be less than %d in species %s" %
                             (split_id, self.species_nsplit[species_id], species_name))
        self.processed_file = "%s_%d.pkl.gz" % (species_name, split_id)
        pkl_file = os.path.join(path, self.processed_file)

        if os.path.exists(pkl_file):
            self.load_pickle(pkl_file, verbose=verbose, **kwargs)
        else:
            tar_file = utils.download(self.urls[species_id], path, md5=self.md5s[species_id])
            pdb_path = utils.extract(tar_file)
            gz_files = sorted(glob.glob(os.path.join(pdb_path, "*.pdb.gz")))
            pdb_files = []
            index = slice(split_id * self.split_length, (split_id + 1) * self.split_length)
            for gz_file in gz_files[index]:
                pdb_files.append(utils.extract(gz_file))
            self.load_pdbs(pdb_files, verbose=verbose, **kwargs)
            self.save_pickle(pkl_file, verbose=verbose)

    def get_item(self, index):
        if getattr(self, "lazy", False):
            # Forward the stored dataset kwargs as keyword arguments to from_pdb,
            # consistent with the lazy loading in BindingDB.get_item below.
            protein = data.Protein.from_pdb(self.pdb_files[index], **self.kwargs)
        else:
            protein = self.data[index].clone()
        # Zhaocheng: I didn't see any code that creates sparse residue features
        if hasattr(protein, "residue_feature"):
            with protein.residue():
                protein.residue_feature = protein.residue_feature.to_dense()
        item = {"graph": protein}
        if self.transform:
            item = self.transform(item)
        return item

    def __repr__(self):
        lines = [
            "#sample: %d" % len(self),
        ]
        return "%s(\n %s\n)" % (self.__class__.__name__, "\n ".join(lines))
@@ -0,0 +1,51 @@
import os

from torch.utils import data as torch_data

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.BetaLactamase")
@utils.copy_args(data.ProteinDataset.load_lmdbs, ignore=("target_fields",))
class BetaLactamase(data.ProteinDataset):
    """
    The activity values of first-order mutants of the TEM-1 beta-lactamase protein.

    Statistics:
        - #Train: 4,158
        - #Valid: 520
        - #Test: 520

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs
    """

    url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/beta_lactamase.tar.gz"
    md5 = "65766a3969cc0e94b101d4063d204ba4"
    splits = ["train", "valid", "test"]
    target_fields = ["scaled_effect1"]

    def __init__(self, path, verbose=1, **kwargs):
        path = os.path.expanduser(path)
        if not os.path.exists(path):
            os.makedirs(path)
        self.path = path

        zip_file = utils.download(self.url, path, md5=self.md5)
        data_path = utils.extract(zip_file)
        lmdb_files = [os.path.join(data_path, "beta_lactamase/beta_lactamase_%s.lmdb" % split)
                      for split in self.splits]

        self.load_lmdbs(lmdb_files, target_fields=self.target_fields, verbose=verbose, **kwargs)

    def split(self):
        offset = 0
        splits = []
        for num_sample in self.num_samples:
            split = torch_data.Subset(self, range(offset, offset + num_sample))
            splits.append(split)
            offset += num_sample
        return splits
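A short sketch of how the three contiguous splits defined above are typically consumed; the path is hypothetical, and the target key is assumed to follow target_fields.

from torchdrug import datasets

dataset = datasets.BetaLactamase("~/protein-datasets/", verbose=1)
train_set, valid_set, test_set = dataset.split()       # Subsets in train/valid/test order
print(len(train_set), len(valid_set), len(test_set))   # expected 4158, 520, 520
sample = train_set[0]
print(sample["graph"], sample["scaled_effect1"])        # protein graph and its scaled activity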
@@ -0,0 +1,52 @@
import os

from torch.utils import data as torch_data

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.BinaryLocalization")
@utils.copy_args(data.ProteinDataset.load_lmdbs, ignore=("target_fields",))
class BinaryLocalization(data.ProteinDataset):
    """
    A simpler version of the Subcellular Localization dataset, with binary labels indicating
    whether a protein is membrane-bound or soluble.

    Statistics:
        - #Train: 5,161
        - #Valid: 1,727
        - #Test: 1,746

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs
    """

    url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz"
    md5 = "5d2309bf1c0c2aed450102578e434f4e"
    splits = ["train", "valid", "test"]
    target_fields = ["localization"]

    def __init__(self, path, verbose=1, **kwargs):
        path = os.path.expanduser(path)
        if not os.path.exists(path):
            os.makedirs(path)
        self.path = path

        zip_file = utils.download(self.url, path, md5=self.md5)
        data_path = utils.extract(zip_file)
        lmdb_files = [os.path.join(data_path, "subcellular_localization_2/subcellular_localization_2_%s.lmdb" % split)
                      for split in self.splits]

        self.load_lmdbs(lmdb_files, target_fields=self.target_fields, verbose=verbose, **kwargs)

    def split(self):
        offset = 0
        splits = []
        for num_sample in self.num_samples:
            split = torch_data.Subset(self, range(offset, offset + num_sample))
            splits.append(split)
            offset += num_sample
        return splits
@@ -0,0 +1,72 @@
import os

from rdkit import Chem

from torch.utils import data as torch_data

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.BindingDB")
@utils.copy_args(data.ProteinLigandDataset.load_lmdbs, ignore=("sequence_field", "smiles_field", "target_fields"))
class BindingDB(data.ProteinLigandDataset):
    """
    The BindingDB dataset with binding affinity indicating the interaction strength
    between pairs of protein and ligand.

    Statistics:
        - #Train: 7,900
        - #Valid: 878
        - #Test: 5,230

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs
    """

    url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/BindingDB_Kd.tar.gz"
    md5 = "0b207cb962c4945f9003fc020b415a74"
    splits = ["train", "valid", "random_test", "holdout_test"]
    target_fields = ["affinity"]

    def __init__(self, path, verbose=1, **kwargs):
        path = os.path.expanduser(path)
        if not os.path.exists(path):
            os.makedirs(path)
        self.path = path
        zip_file = utils.download(self.url, path, md5=self.md5)
        data_path = utils.extract(zip_file)
        lmdb_files = [os.path.join(data_path, "BindingDB_Kd_%s.lmdb" % split) for split in self.splits]

        self.load_lmdbs(lmdb_files, sequence_field="target", smiles_field="drug",
                        target_fields=self.target_fields, verbose=verbose, **kwargs)

    def split(self, keys=None):
        keys = keys or self.splits
        offset = 0
        splits = []
        for split_name, num_sample in zip(self.splits, self.num_samples):
            if split_name in keys:
                split = torch_data.Subset(self, range(offset, offset + num_sample))
                splits.append(split)
            offset += num_sample
        return splits

    def get_item(self, index):
        if self.lazy:
            graph1 = data.Protein.from_sequence(self.sequences[index], **self.kwargs)
            mol = Chem.MolFromSmiles(self.smiles[index])
            if not mol:
                graph2 = None
            else:
                graph2 = data.Molecule.from_molecule(mol, **self.kwargs)
        else:
            graph1 = self.data[index][0]
            graph2 = self.data[index][1]
        item = {"graph1": graph1, "graph2": graph2}
        item.update({k: v[index] for k, v in self.targets.items()})
        if self.transform:
            item = self.transform(item)
        return item
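A sketch of selecting a subset of the four named splits via the keys argument of split(); the path is hypothetical, and graph2 can be None only in lazy mode when RDKit fails to parse a SMILES string.

from torchdrug import datasets

dataset = datasets.BindingDB("~/protein-datasets/bindingdb/", verbose=1)
# Keep the random test split and drop the held-out one.
train_set, valid_set, random_test = dataset.split(keys=["train", "valid", "random_test"])
sample = train_set[0]
print(sample["graph1"])    # protein (data.Protein)
print(sample["graph2"])    # ligand (data.Molecule)
print(sample["affinity"])  # binding affinity target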