
Commit

add protein datasets
Oxer11 committed Sep 16, 2022
1 parent 462fd86 commit 2be5c85
Showing 18 changed files with 1,298 additions and 4 deletions.
30 changes: 26 additions & 4 deletions torchdrug/datasets/__init__.py
@@ -20,9 +20,27 @@
 from .zinc2m import ZINC2m
 from .pcqm4m import PCQM4M
 from .pubchem110m import PubChem110m
 
 from .chembl_filtered import ChEMBLFiltered
 
+from .beta_lactamase import BetaLactamase
+from .fluorescence import Fluorescence
+from .stability import Stability
+from .solubility import Solubility
+from .fold import Fold
+from .binary_localization import BinaryLocalization
+from .subcellular_localization import SubcellularLocalization
+from .secondary_structure import SecondaryStructure
+from .human_ppi import HumanPPI
+from .yeast_ppi import YeastPPI
+from .ppi_affinity import PPIAffinity
+from .bindingdb import BindingDB
+from .pdbbind import PDBBind
+from .proteinnet import ProteinNet
+
+from .enzyme_commission import EnzymeCommission
+from .gene_ontology import GeneOntology
+from .alphafolddb import AlphaFoldDB
+
 from .fb15k import FB15k, FB15k237
 from .wn18 import WN18, WN18RR
 from .hetionet import Hetionet
@@ -32,10 +50,14 @@
 from .pubmed import PubMed
 
 __all__ = [
-    "BACE", "BBBP", "CEP", "ChEMBLFiltered", "ClinTox", "Delaney", "FreeSolv", "HIV", "Lipophilicity",
+    "BACE", "BBBP", "CEP", "ClinTox", "Delaney", "FreeSolv", "HIV", "Lipophilicity",
     "Malaria", "MOSES", "MUV", "OPV", "QM8", "QM9", "SIDER", "Tox21", "ToxCast",
     "USPTO50k", "ZINC250k",
-    "ZINC2m", "PCQM4M", "PubChem110m",
+    "ZINC2m", "PCQM4M", "PubChem110m", "ChEMBLFiltered",
+    "EnzymeCommission", "GeneOntology", "AlphaFoldDB",
+    "BetaLactamase", "Fluorescence", "Stability", "Solubility", "Fold",
+    "BinaryLocalization", "SubcellularLocalization", "SecondaryStructure",
+    "HumanPPI", "YeastPPI", "PPIAffinity", "BindingDB", "PDBBind", "ProteinNet",
     "FB15k", "FB15k237", "WN18", "WN18RR", "Hetionet",
-    "Cora", "CiteSeer",
+    "Cora", "CiteSeer", "PubMed",
 ]
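With these changes, every new protein dataset is exported at the package level. A minimal sketch of what the new exports look like from user code, assuming a TorchDrug installation at this revision:

from torchdrug import datasets

# Every name in __all__ above is importable from torchdrug.datasets,
# including the protein datasets added by this commit.
assert "AlphaFoldDB" in datasets.__all__
assert "BindingDB" in datasets.__all__
dataset_cls = datasets.BetaLactamase  # resolved via the new import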
150 changes: 150 additions & 0 deletions torchdrug/datasets/alphafolddb.py
@@ -0,0 +1,150 @@
import os
import glob

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.AlphaFoldDB")
@utils.copy_args(data.ProteinDataset.load_pdbs, ignore=("filtered_pdb",))
class AlphaFoldDB(data.ProteinDataset):
    """
    3D protein structures predicted by AlphaFold.
    This dataset covers the proteomes of 48 organisms, as well as the majority of Swiss-Prot.

    Statistics:
        See https://alphafold.ebi.ac.uk/download

    Parameters:
        path (str): path to store the dataset
        species_id (int, optional): the id of the species to load. Species are numbered
            in the order they appear on https://alphafold.ebi.ac.uk/download
            (0-20 for model organism proteomes, 21 for Swiss-Prot)
        split_id (int, optional): the id of the split to load. To avoid large memory
            consumption for a single dataset, each species is cut into several splits,
            each containing at most 22000 proteins
        verbose (int, optional): output verbose level
        **kwargs
    """

urls = [
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006548_3702_ARATH_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001940_6239_CAEEL_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000559_237561_CANAL_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000437_7955_DANRE_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002195_44689_DICDI_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000803_7227_DROME_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000625_83333_ECOLI_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008827_3847_SOYBN_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000005640_9606_HUMAN_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000805_243232_METJA_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000589_10090_MOUSE_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000059680_39947_ORYSJ_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002494_10116_RAT_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002311_559292_YEAST_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002485_284812_SCHPO_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007305_4577_MAIZE_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_pdb_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001631_447093_AJECG_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006672_6279_BRUMA_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000799_192222_CAMJE_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000094526_86049_9EURO1_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000274756_318479_DRAME_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000325664_1352_ENTFC_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000053029_1442368_9EURO2_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000579_71421_HAEIN_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000429_85962_HELPY_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007841_1125630_KLEPH_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000078237_100816_9PEZI1_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000806_272631_MYCLE_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000020681_1299332_MYCUL_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000535_242231_NEIG1_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006304_1133849_9NOCA1_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000024404_6282_ONCVO_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002059_502779_PARBA_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002438_208964_PSEAE_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001014_99287_SALTY_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008854_6183_SCHMA_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002716_300267_SHIDS_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000018087_1391915_SPOS1_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000586_171101_STRR6_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000035681_6248_STRER_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000030665_36087_TRITR_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008524_185431_TRYB2_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
"https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000270924_6293_WUCBA_v2.tar"
]
md5s = [
"82b14d14404e39793cf73c1e0f083865", "9e26602ba2d9f233ef4fcf82703ddb59",
"60a09db1e1c47a98763d09879784f536", "a0ab562b7372f149673c4518f949501f",
"6205138b14fb7e7ec09b366e3e4f294b", "31f31359cd7254f82304e3886440bdd3",
"a590096e65461ed4eb092b2147b97f0b", "8f1e120f372995644a7101ad58e5b2ae",
"9a659c4aed2a8b833478dcd5fffc5fd8", "95d775f2ae271cf50a101c73335cd250",
"e5b12da43f5bd77298ca50e19706bdeb", "90e953abba9c8fe202e0adf825c0dfcc",
"38a11553c7e2d00482281e74f7daf321", "2bcdfe2c37154a355fe4e8150c279c13",
"580a55e56a44fed935f0101c37a8c4ab", "b8d08a9033d111429fadb4e25820f9f7",
"59d1167f414a86cbccfb204791fea0eb", "dfde6b44026f19a88f1abc8ac2798ce6",
"a1c2047a16130d61cac4db23b2f5b560", "e4d4b72df8d075aeb607dcb095210304",
"5cdad48c799ffd723636cae26433f1f9", "98a7c13987f578277bfb66ac48a1e242",
]
species_nsplit = [
2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 20,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1
]
split_length = 22000

def __init__(self, path, species_id=0, split_id=0, verbose=1, **kwargs):
path = os.path.expanduser(path)
if not os.path.exists(path):
os.makedirs(path)
self.path = path

species_name = os.path.basename(self.urls[species_id])[:-4]
if split_id >= self.species_nsplit[species_id]:
raise ValueError("Split id %d should be less than %d in species %s" %
(split_id, self.species_nsplit[species_id], species_name))
self.processed_file = "%s_%d.pkl.gz" % (species_name, split_id)
pkl_file = os.path.join(path, self.processed_file)

if os.path.exists(pkl_file):
self.load_pickle(pkl_file, verbose=verbose, **kwargs)
else:
tar_file = utils.download(self.urls[species_id], path, md5=self.md5s[species_id])
pdb_path = utils.extract(tar_file)
gz_files = sorted(glob.glob(os.path.join(pdb_path, "*.pdb.gz")))
pdb_files = []
index = slice(split_id * self.split_length, (split_id + 1) * self.split_length)
for gz_file in gz_files[index]:
pdb_files.append(utils.extract(gz_file))
self.load_pdbs(pdb_files, verbose=verbose, **kwargs)
self.save_pickle(pkl_file, verbose=verbose)

def get_item(self, index):
if getattr(self, "lazy", False):
            protein = data.Protein.from_pdb(self.pdb_files[index], **self.kwargs)
else:
protein = self.data[index].clone()
# Zhaocheng: I didn't see any code that creates sparse residue features
if hasattr(protein, "residue_feature"):
with protein.residue():
protein.residue_feature = protein.residue_feature.to_dense()
item = {"graph": protein}
if self.transform:
item = self.transform(item)
return item

def __repr__(self):
lines = [
"#sample: %d" % len(self),
]
return "%s(\n %s\n)" % (self.__class__.__name__, "\n ".join(lines))
51 changes: 51 additions & 0 deletions torchdrug/datasets/beta_lactamase.py
@@ -0,0 +1,51 @@
import os

from torch.utils import data as torch_data

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.BetaLactamase")
@utils.copy_args(data.ProteinDataset.load_lmdbs, ignore=("target_fields",))
class BetaLactamase(data.ProteinDataset):
    """
    The activity values of first-order mutants of the TEM-1 beta-lactamase protein.

    Statistics:
        - #Train: 4,158
        - #Valid: 520
        - #Test: 520

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs
    """

url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/beta_lactamase.tar.gz"
md5 = "65766a3969cc0e94b101d4063d204ba4"
splits = ["train", "valid", "test"]
target_fields = ["scaled_effect1"]

def __init__(self, path, verbose=1, **kwargs):
path = os.path.expanduser(path)
if not os.path.exists(path):
os.makedirs(path)
self.path = path

zip_file = utils.download(self.url, path, md5=self.md5)
data_path = utils.extract(zip_file)
lmdb_files = [os.path.join(data_path, "beta_lactamase/beta_lactamase_%s.lmdb" % split)
for split in self.splits]

self.load_lmdbs(lmdb_files, target_fields=self.target_fields, verbose=verbose, **kwargs)

def split(self):
offset = 0
splits = []
for num_sample in self.num_samples:
split = torch_data.Subset(self, range(offset, offset + num_sample))
splits.append(split)
offset += num_sample
return splits
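A sketch of the split() contract above, with an illustrative path: the three LMDB files are loaded in train/valid/test order, and split() carves out contiguous, non-overlapping subsets in that same order.

from torchdrug import datasets

dataset = datasets.BetaLactamase("~/protein-datasets/")
train_set, valid_set, test_set = dataset.split()
# The subsets are taken sequentially from num_samples, so they tile the
# whole dataset exactly.
assert len(train_set) + len(valid_set) + len(test_set) == len(dataset)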
52 changes: 52 additions & 0 deletions torchdrug/datasets/binary_localization.py
@@ -0,0 +1,52 @@
import os

from torch.utils import data as torch_data

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.BinaryLocalization")
@utils.copy_args(data.ProteinDataset.load_lmdbs, ignore=("target_fields",))
class BinaryLocalization(data.ProteinDataset):
    """
    A simpler version of the SubcellularLocalization dataset, with binary labels
    indicating whether a protein is membrane-bound or soluble.

    Statistics:
        - #Train: 5,161
        - #Valid: 1,727
        - #Test: 1,746

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs
    """

url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz"
md5 = "5d2309bf1c0c2aed450102578e434f4e"
splits = ["train", "valid", "test"]
target_fields = ["localization"]

def __init__(self, path, verbose=1, **kwargs):
path = os.path.expanduser(path)
if not os.path.exists(path):
os.makedirs(path)
self.path = path

zip_file = utils.download(self.url, path, md5=self.md5)
data_path = utils.extract(zip_file)
lmdb_files = [os.path.join(data_path, "subcellular_localization_2/subcellular_localization_2_%s.lmdb" % split)
for split in self.splits]

self.load_lmdbs(lmdb_files, target_fields=self.target_fields, verbose=verbose, **kwargs)

def split(self):
offset = 0
splits = []
for num_sample in self.num_samples:
split = torch_data.Subset(self, range(offset, offset + num_sample))
splits.append(split)
offset += num_sample
return splits
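BinaryLocalization reuses the same split() pattern. The **kwargs are forwarded to load_lmdbs, which accepts a transform applied to each item on access; a sketch assuming torchdrug.transforms.ProteinView behaves as in the rest of the library:

from torchdrug import datasets, transforms

# View each protein at residue level; the transform runs on every item
# dict returned when the dataset is indexed.
transform = transforms.ProteinView(view="residue")
dataset = datasets.BinaryLocalization("~/protein-datasets/", transform=transform)
item = dataset[0]  # {"graph": data.Protein, "localization": binary label}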
72 changes: 72 additions & 0 deletions torchdrug/datasets/bindingdb.py
@@ -0,0 +1,72 @@
import os

from rdkit import Chem

from torch.utils import data as torch_data

from torchdrug import data, utils
from torchdrug.core import Registry as R


@R.register("datasets.BindingDB")
@utils.copy_args(data.ProteinLigandDataset.load_lmdbs, ignore=("sequence_field", "smiles_field", "target_fields"))
class BindingDB(data.ProteinLigandDataset):
    """
    The BindingDB dataset, with binding affinities indicating the interaction
    strength between protein-ligand pairs.

    Statistics:
        - #Train: 7,900
        - #Valid: 878
        - #Test: 5,230

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs
    """

url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/BindingDB_Kd.tar.gz"
md5 = "0b207cb962c4945f9003fc020b415a74"
splits = ["train", "valid", "random_test", "holdout_test"]
target_fields = ["affinity"]

def __init__(self, path, verbose=1, **kwargs):
path = os.path.expanduser(path)
if not os.path.exists(path):
os.makedirs(path)
self.path = path
zip_file = utils.download(self.url, path, md5=self.md5)
data_path = utils.extract(zip_file)
lmdb_files = [os.path.join(data_path, "BindingDB_Kd_%s.lmdb" % split) for split in self.splits]

self.load_lmdbs(lmdb_files, sequence_field="target", smiles_field="drug",
target_fields=self.target_fields, verbose=verbose, **kwargs)

def split(self, keys=None):
keys = keys or self.splits
offset = 0
splits = []
for split_name, num_sample in zip(self.splits, self.num_samples):
if split_name in keys:
split = torch_data.Subset(self, range(offset, offset + num_sample))
splits.append(split)
offset += num_sample
return splits

def get_item(self, index):
if self.lazy:
graph1 = data.Protein.from_sequence(self.sequences[index], **self.kwargs)
mol = Chem.MolFromSmiles(self.smiles[index])
if not mol:
graph2 = None
else:
graph2 = data.Molecule.from_molecule(mol, **self.kwargs)
else:
graph1 = self.data[index][0]
graph2 = self.data[index][1]
item = {"graph1": graph1, "graph2": graph2}
item.update({k: v[index] for k, v in self.targets.items()})
if self.transform:
item = self.transform(item)
return item
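A sketch of consuming the protein-ligand pairs above; note that split() accepts an optional list of split names, so the two test sets can be selected individually:

from torchdrug import datasets

dataset = datasets.BindingDB("~/protein-datasets/")
# Keep three of the four named splits; "holdout_test" is dropped here.
train_set, valid_set, test_set = dataset.split(["train", "valid", "random_test"])

item = dataset[0]
protein, ligand = item["graph1"], item["graph2"]  # data.Protein, data.Molecule
affinity = item["affinity"]                       # target field declared above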
(Diff truncated: 13 more changed files not shown.)
