From 09ffe181a3b025a770e157ace3e2f64436b1919d Mon Sep 17 00:00:00 2001 From: Zhaocheng Zhu Date: Sun, 18 Sep 2022 22:35:52 -0400 Subject: [PATCH] add papers implemented --- doc/source/api/layers.rst | 5 +++++ doc/source/bibliography.rst | 7 ++++++ doc/source/conf.py | 2 +- doc/source/paper.rst | 35 +++++++++++++++++++++++++++++ torchdrug/data/dataset.py | 5 ++--- torchdrug/data/protein.py | 10 ++++----- torchdrug/layers/common.py | 14 ++++++++++-- torchdrug/layers/geometry/graph.py | 8 +++---- torchdrug/models/bert.py | 2 +- torchdrug/models/esm.py | 2 +- torchdrug/models/lstm.py | 2 +- torchdrug/models/physicochemical.py | 2 +- torchdrug/models/statistic.py | 2 +- torchdrug/tasks/pretrain.py | 4 ++-- 14 files changed, 77 insertions(+), 23 deletions(-) diff --git a/doc/source/api/layers.rst b/doc/source/api/layers.rst index 79c5e663..eb25d6cf 100644 --- a/doc/source/api/layers.rst +++ b/doc/source/api/layers.rst @@ -21,6 +21,11 @@ MutualInformation .. autoclass:: MutualInformation :members: +SinusoidalPositionEmbedding +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. autoclass:: SinusoidalPositionEmbedding + :members: + PairNorm ^^^^^^^^ .. autoclass:: PairNorm diff --git a/doc/source/bibliography.rst b/doc/source/bibliography.rst index 015f0338..a133f86f 100644 --- a/doc/source/bibliography.rst +++ b/doc/source/bibliography.rst @@ -57,6 +57,13 @@ .. Retrosynthesis .. _G2Gs: https://arxiv.org/pdf/2003.12725.pdf +.. Protein Representation Learning +.. _TAPE: https://proceedings.neurips.cc/paper/2019/file/37f65c068b7723cd7809ee2d31d7861c-Paper.pdf +.. _ProteinCNN: https://arxiv.org/pdf/2011.03443.pdf +.. _ESM: https://www.biorxiv.org/content/10.1101/622803v1.full.pdf +.. _GearNet: https://arxiv.org/pdf/2203.06125.pdf + +.. Knowledge Graph Reasoning .. _TransE: http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf .. _DistMult: https://arxiv.org/pdf/1412.6575.pdf .. 
_ComplEx: http://proceedings.mlr.press/v48/trouillon16.pdf diff --git a/doc/source/conf.py b/doc/source/conf.py index ebadb05b..a21fb9ae 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -85,7 +85,7 @@ # html_theme = "furo" -html_logo = "../../asset/logo.svg" +html_logo = "../../asset/torchdrug_logo_full.svg" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the diff --git a/doc/source/paper.rst b/doc/source/paper.rst index c22a7aeb..a1b2aa56 100644 --- a/doc/source/paper.rst +++ b/doc/source/paper.rst @@ -147,6 +147,41 @@ Retrosynthesis :class:`SynthonCompletion `, :class:`Retrosynthesis ` +Protein Representation Learning +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. `Evaluating Protein Transfer Learning with TAPE `_ + + Roshan Rao, Nicholas Bhattacharya, Neil Thomas, Yan Duan, Xi Chen, John Canny, Pieter Abbeel, Yun S Song. NeurIPS 2019. + + :class:`SinusoidalPositionEmbedding ` + :class:`SelfAttentionBlock ` + :class:`ProteinResNetBlock ` + :class:`ProteinBERTBlock ` + :class:`ProteinResNet ` + :class:`ProteinLSTM ` + :class:`ProteinBERT ` + +2. `Is Transfer Learning Necessary for Protein Landscape Prediction? `_ + + Amir Shanehsazzadeh, David Belanger, David Dohan. arXiv 2020. + + :class:`ProteinCNN ` + +3. `Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences `_ + + Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, Rob Fergus. PNAS 2021. + + :class:`EvolutionaryScaleModeling ` + +4. `Protein Representation Learning by Geometric Structure Pretraining `_ + + Zuobai Zhang, Minghao Xu, Arian Jamasb, Vijil Chenthamarakshan, Aurélie Lozano, Payel Das, Jian Tang. arXiv 2022. 
+ + :class:`GeometricRelationalGraphConv ` + :class:`GeometryAwareRelationalGraphNeuralNetwork ` + :mod:`torchdrug.layers.geometry` + Knowledge Graph Reasoning ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/torchdrug/data/dataset.py b/torchdrug/data/dataset.py index 65d710df..ce79aac7 100644 --- a/torchdrug/data/dataset.py +++ b/torchdrug/data/dataset.py @@ -715,13 +715,12 @@ def load_lmdbs(self, lmdb_files, sequence_field="primary", target_fields=None, n self.num_samples = num_samples @utils.copy_args(data.Protein.from_molecule) - def load_pdbs(self, pdb_files, sanitize=True, transform=None, lazy=False, verbose=0, **kwargs): + def load_pdbs(self, pdb_files, transform=None, lazy=False, verbose=0, **kwargs): """ Load the dataset from pdb files. Parameters: pdb_files (list of str): pdb file names - sanitize (bool, optional): whether to sanitize the molecule transform (Callable, optional): protein sequence transformation function lazy (bool, optional): if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time. @@ -744,7 +743,7 @@ def load_pdbs(self, pdb_files, sanitize=True, transform=None, lazy=False, verbos pdb_files = tqdm(pdb_files, "Constructing proteins from pdbs") for i, pdb_file in enumerate(pdb_files): if not lazy or i == 0: - mol = Chem.MolFromPDBFile(pdb_file, sanitize=sanitize) + mol = Chem.MolFromPDBFile(pdb_file) if not mol: logger.debug("Can't construct molecule from pdb file `%s`. Ignore this sample." 
% pdb_file) continue diff --git a/torchdrug/data/protein.py b/torchdrug/data/protein.py index 23b7a36c..6e34b7ef 100644 --- a/torchdrug/data/protein.py +++ b/torchdrug/data/protein.py @@ -305,7 +305,7 @@ def from_sequence(cls, sequence, atom_feature="default", bond_feature="default", @classmethod @utils.deprecated_alias(node_feature="atom_feature", edge_feature="bond_feature", graph_feature="mol_feature") def from_pdb(cls, pdb_file, atom_feature="default", bond_feature="default", residue_feature="default", - mol_feature=None, kekulize=False, sanitize=False): + mol_feature=None, kekulize=False): """ Create a protein from a PDB file. @@ -319,11 +319,10 @@ def from_pdb(cls, pdb_file, atom_feature="default", bond_feature="default", resi Note this only affects the relation in ``edge_list``. For ``bond_type``, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored. - sanitize (bool, optional): whether to sanitize the molecule """ if not os.path.exists(pdb_file): raise FileNotFoundError("No such file `%s`" % pdb_file) - mol = Chem.MolFromPDBFile(pdb_file, sanitize=sanitize) + mol = Chem.MolFromPDBFile(pdb_file) if mol is None: raise ValueError("RDKit cannot read PDB file `%s`" % pdb_file) return cls.from_molecule(mol, atom_feature, bond_feature, residue_feature, mol_feature, kekulize) @@ -1052,7 +1051,7 @@ def from_sequence(cls, sequences, atom_feature="default", bond_feature="default" @classmethod @utils.deprecated_alias(node_feature="atom_feature", edge_feature="bond_feature", graph_feature="mol_feature") def from_pdb(cls, pdb_files, atom_feature="default", bond_feature="default", residue_feature="default", - mol_feature=None, kekulize=False, sanitize=False): + mol_feature=None, kekulize=False): """ Create a protein from a list of PDB files. @@ -1066,11 +1065,10 @@ def from_pdb(cls, pdb_files, atom_feature="default", bond_feature="default", res Note this only affects the relation in ``edge_list``. 
For ``bond_type``, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored. - sanitize (bool, optional): whether to sanitize the molecule """ mols = [] for pdb_file in pdb_files: - mol = Chem.MolFromPDBFile(pdb_file, sanitize=sanitize) + mol = Chem.MolFromPDBFile(pdb_file) mols.append(mol) return cls.from_molecule(mols, atom_feature, bond_feature, residue_feature, mol_feature, kekulize) diff --git a/torchdrug/layers/common.py b/torchdrug/layers/common.py index b4676343..98d06d76 100644 --- a/torchdrug/layers/common.py +++ b/torchdrug/layers/common.py @@ -74,7 +74,7 @@ def forward(self, input): class GaussianSmearing(nn.Module): r""" Gaussian smearing from - `SchNet: A continuous-filter convolutional neural network for modeling quantum interactions`_. + `SchNet: A continuous-filter convolutional neural network for modeling quantum interactions`_. There are two modes for Gaussian smearing. @@ -167,7 +167,7 @@ def forward(self, graph, input): class InstanceNorm(nn.modules.instancenorm._InstanceNorm): """ Instance normalization for graphs. This layer follows the definition in - `GraphNorm: A Principled Approach to Accelerating Graph Neural Network Training`. + `GraphNorm: A Principled Approach to Accelerating Graph Neural Network Training`_. .. _GraphNorm\: A Principled Approach to Accelerating Graph Neural Network Training: https://arxiv.org/pdf/2009.03294.pdf @@ -325,6 +325,15 @@ def forward(self, *args, **kwargs): class SinusoidalPositionEmbedding(nn.Module): + """ + Positional embedding based on sine and cosine functions, proposed in `Attention Is All You Need`_. + + ..
_Attention Is All You Need: + https://arxiv.org/pdf/1706.03762.pdf + + Parameters: + output_dim (int): output dimension + """ def __init__(self, output_dim): super(SinusoidalPositionEmbedding, self).__init__() @@ -332,6 +341,7 @@ def __init__(self, output_dim): self.register_buffer("inverse_frequency", inverse_frequency) def forward(self, input): + """""" # input: [B, L, ...] positions = torch.arange(input.shape[1] - 1, -1, -1.0, dtype=input.dtype, device=input.device) sinusoidal_input = torch.outer(positions, self.inverse_frequency) diff --git a/torchdrug/layers/geometry/graph.py b/torchdrug/layers/geometry/graph.py index 71c21648..1841d5b7 100644 --- a/torchdrug/layers/geometry/graph.py +++ b/torchdrug/layers/geometry/graph.py @@ -46,8 +46,8 @@ def __init__(self, node_layers=None, edge_layers=None, edge_feature="residue_typ def edge_residue_type(self, graph, edge_list): node_in, node_out, _ = edge_list.t() residue_in, residue_out = graph.atom2residue[node_in], graph.atom2residue[node_out] - in_residue_type = graph.edge_residue_type[residue_in] - out_residue_type = graph.edge_residue_type[residue_out] + in_residue_type = graph.residue_type[residue_in] + out_residue_type = graph.residue_type[residue_out] return torch.cat([ functional.one_hot(in_residue_type, len(data.Protein.residue2id)), @@ -57,8 +57,8 @@ def edge_residue_type(self, graph, edge_list): def edge_gearnet(self, graph, edge_list, num_relation): node_in, node_out, r = edge_list.t() residue_in, residue_out = graph.atom2residue[node_in], graph.atom2residue[node_out] - in_residue_type = graph.edge_residue_type[residue_in] - out_residue_type = graph.edge_residue_type[residue_out] + in_residue_type = graph.residue_type[residue_in] + out_residue_type = graph.residue_type[residue_out] sequential_dist = torch.abs(residue_in - residue_out) spatial_dist = (graph.node_position[node_in] - graph.node_position[node_out]).norm(dim=-1) diff --git a/torchdrug/models/bert.py b/torchdrug/models/bert.py index 
b7059374..6d700574 100644 --- a/torchdrug/models/bert.py +++ b/torchdrug/models/bert.py @@ -67,7 +67,7 @@ def forward(self, graph, input, all_loss=None, metric=None): dict with ``residue_feature`` and ``graph_feature`` fields: residue representations of shape :math:`(|V_{res}|, d)`, graph representations of shape :math:`(n, d)` """ - input = graph.edge_residue_type + input = graph.residue_type size_ext = graph.num_residues # Prepend BOS bos = torch.ones(graph.batch_size, dtype=torch.long, device=self.device) * self.num_residue_type diff --git a/torchdrug/models/esm.py b/torchdrug/models/esm.py index 460ae70b..65b5a599 100644 --- a/torchdrug/models/esm.py +++ b/torchdrug/models/esm.py @@ -100,7 +100,7 @@ def forward(self, graph, input, all_loss=None, metric=None): dict with ``residue_feature`` and ``graph_feature`` fields: residue representations of shape :math:`(|V_{res}|, d)`, graph representations of shape :math:`(n, d)` """ - input = graph.edge_residue_type + input = graph.residue_type input = self.mapping[input] size = graph.num_residues if (size > self.max_input_length).any(): diff --git a/torchdrug/models/lstm.py b/torchdrug/models/lstm.py index 74f36816..834b86a7 100644 --- a/torchdrug/models/lstm.py +++ b/torchdrug/models/lstm.py @@ -4,7 +4,7 @@ from torch import nn from torch.nn import functional as F -from torchdrug import core, layers +from torchdrug import core from torchdrug.layers import functional from torchdrug.core import Registry as R diff --git a/torchdrug/models/physicochemical.py b/torchdrug/models/physicochemical.py index 0cf82b39..4a055176 100644 --- a/torchdrug/models/physicochemical.py +++ b/torchdrug/models/physicochemical.py @@ -88,7 +88,7 @@ def forward(self, graph, input, all_loss=None, metric=None): Returns: dict with ``graph_feature`` field: graph representations of shape :math:`(n, d)` """ - input = graph.edge_residue_type + input = graph.residue_type x = self.property[input] # num_residue * 8 x_mean = scatter_mean(x, 
graph.residue2graph, dim=0, dim_size=graph.batch_size) # batch_size * 8 diff --git a/torchdrug/models/statistic.py b/torchdrug/models/statistic.py index 6c3eacf7..941cde81 100644 --- a/torchdrug/models/statistic.py +++ b/torchdrug/models/statistic.py @@ -58,7 +58,7 @@ def forward(self, graph, input, all_loss=None, metric=None): Returns: dict with ``graph_feature`` field: graph representations of shape :math:`(n, d)` """ - input = graph.edge_residue_type + input = graph.residue_type index = input[:-1] * self.num_residue_type + input[1:] index = graph.residue2graph[:-1] * self.input_dim + index diff --git a/torchdrug/tasks/pretrain.py b/torchdrug/tasks/pretrain.py index 3270c78f..5d0f28e9 100644 --- a/torchdrug/tasks/pretrain.py +++ b/torchdrug/tasks/pretrain.py @@ -133,10 +133,10 @@ def predict_and_target(self, batch, all_loss=None, metric=None): input = graph.node_feature.float() input[node_index] = 0 else: - target = graph.edge_residue_type[node_index] + target = graph.residue_type[node_index] with graph.residue(): graph.residue_feature[node_index] = 0 - graph.edge_residue_type[node_index] = 0 + graph.residue_type[node_index] = 0 # Generate masked edge features. Any better implementation? if self.graph_construction_model: graph = self.graph_construction_model.apply_edge_layer(graph)