1.5.0 patch3 (a-r-j#189)
* refactor PDB download to support parallelisation

* add tests for PDB download

* refactor PDB download function call in getcontacts

* add salt bridges and vdw interactions

* use multiprocess downloading in torch datasets

* add additional edge funcs

* add residue_id to dfs and additional constants

* update tests to account for residue id column

* update dl func to check for obsolete PDBs

* fix type error on paths

* fix obsolete test

* add bad pdb attr

* add return to PDB download util

* Merge master -> Patch (a-r-j#190)

* update bioservices to account for new UniProt API (a-r-j#187)

* update bioservices to account for new UniProt API

* update changelog

* Update PULL_REQUEST_TEMPLATE.md

* Graph plots (a-r-j#186)

* Fix param name typo in function docstring

* add scaling node size by "rsa" feature as well as degree

* add option for scaling node size by Meiler embedding dimensions. Negative values are treated as zero.

* remove walrus operator := for compatibility

* Add type hints

* Update changelog

Co-authored-by: Arian Jamasb <[email protected]>

Co-authored-by: Cam <[email protected]>

* patch torch geometric dataset and update changelog

* black, isort

* isort

Co-authored-by: Cam <[email protected]>
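
For illustration, a minimal sketch of the node-size scaling described in the "Graph plots" bullets above. The `node_size_feature` keyword is an assumption inferred from the commit messages (a-r-j#186), not a confirmed signature; check the released plotting API before relying on it.

```python
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from graphein.protein.visualisation import plotly_protein_structure_graph

# Build a residue-level graph for a small structure.
g = construct_graph(config=ProteinGraphConfig(), pdb_code="3eiy")

# Scale node sizes by a computed per-node feature instead of a fixed size.
# NOTE: `node_size_feature` is an assumed keyword based on the PR description;
# "rsa" would additionally require RSA to have been computed on the graph.
fig = plotly_protein_structure_graph(
    g,
    node_size_feature="degree",
    node_size_multiplier=1.0,
)
fig.show()
```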
a-r-j and kamurani authored Jul 11, 2022
1 parent b7721de commit 553fd26
Showing 9 changed files with 717 additions and 116 deletions.
15 changes: 14 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,22 @@
### 1.5.1

#### Changes
#### Protein

* [Feature] - [#186](https://github.com/a-r-j/graphein/pull/186) adds support for scaling node sizes in plots by a computed feature. Contribution by @cimranm
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds support for parallelised download from the PDB.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds support for van der Waals interactions, vdw clashes, pi-stacking interactions, t_stacking interactions, backbone carbonyl-carbonyl interactions, and salt bridges.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds a `residue_id` column to PDB dfs to enable easier accounting in atom graphs.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) refactors torch geometric datasets to use parallelised download for faster dataset preparation.
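
For illustration, a minimal sketch of the parallelised download added in #189. The call mirrors the usage visible in the dataset diff below (a list of PDB codes, an output directory, `max_workers`, `strict`); anything beyond that, such as creating the output directory up front, is an assumption.

```python
from pathlib import Path

from graphein.protein.utils import download_pdb_multiprocessing

out_dir = Path("pdbs")
out_dir.mkdir(exist_ok=True)
pdb_codes = ["3eiy", "4hhb", "1lds"]

# Fetch structures in parallel. With strict=False, failures (e.g. obsolete
# entries) do not raise, mirroring how the torch datasets use it below.
download_pdb_multiprocessing(pdb_codes, out_dir, max_workers=4, strict=False)

# Codes whose files did not appear can be collected afterwards:
missing = [p for p in pdb_codes if not (out_dir / f"{p}.pdb").exists()]
```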

#### Bugfixes

* [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.
* [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`.

#### Breaking Changes

* [#189](https://github.com/a-r-j/graphein/pull/189/) refactors the PDB download util. It now returns the path to the downloaded file and no longer accepts a config object; instead it takes the output directory path directly.
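
A short sketch of the refactored util per the entry above: pass the output directory directly and use the returned path. The positional argument order shown here is an assumption.

```python
from graphein.protein.utils import download_pdb

# Previously the function took a config object; now the output directory is
# passed directly and the path to the downloaded file is returned.
pdb_path = download_pdb("4hhb", "pdbs")  # argument order assumed
print(pdb_path)  # e.g. pdbs/4hhb.pdb
```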


### 1.5.0

48 changes: 44 additions & 4 deletions graphein/ml/datasets/torch_geometric_dataset.py
@@ -17,7 +17,11 @@
from graphein.ml.conversion import GraphFormatConvertor
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graphs_mp
-from graphein.protein.utils import download_alphafold_structure, download_pdb
+from graphein.protein.utils import (
+    download_alphafold_structure,
+    download_pdb,
+    download_pdb_multiprocessing,
+)
from graphein.utils.utils import import_message

try:
@@ -138,6 +142,9 @@ def __init__(
        elif self.uniprot_ids:
            self.structures = uniprot_ids
            self.af_version = af_version
+        self.bad_pdbs: List[
+            str
+        ] = []  # list of pdb codes that failed to download

        # Labels & Chains
        self.graph_label_map = graph_label_map
@@ -173,7 +180,23 @@ def download(self):
        """Download the PDB files from RCSB or Alphafold."""
        self.config.pdb_dir = Path(self.raw_dir)
        if self.pdb_codes:
-            [download_pdb(self.config, pdb) for pdb in tqdm(self.pdb_codes)]
+            # Only download PDBs that are not already downloaded
+            to_download = [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
+            download_pdb_multiprocessing(
+                to_download,
+                self.raw_dir,
+                max_workers=self.num_cores,
+                strict=False,
+            )
+            self.bad_pdbs = self.bad_pdbs + [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
        if self.uniprot_ids:
            [
                download_alphafold_structure(
@@ -375,6 +398,7 @@ def __init__(
        self.graph_label_map = graph_label_map
        self.node_label_map = node_label_map
        self.chain_selection_map = chain_selection_map
+        self.bad_pdbs: List[str] = []

        # Configs
        self.config = graphein_config
@@ -404,7 +428,23 @@ def download(self):
        """Download the PDB files from RCSB or Alphafold."""
        self.config.pdb_dir = Path(self.raw_dir)
        if self.pdb_codes:
-            [download_pdb(self.config, pdb) for pdb in tqdm(self.pdb_codes)]
+            # Only download undownloaded PDBs
+            to_download = [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
+            download_pdb_multiprocessing(
+                to_download,
+                self.raw_dir,
+                max_workers=self.num_cores,
+                strict=False,
+            )
+            self.bad_pdbs = self.bad_pdbs + [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
        if self.uniprot_ids:
            [
                download_alphafold_structure(
@@ -458,7 +498,7 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
                    self.chain_selection_map[pdb]
                    if pdb in self.chain_selection_map.keys()
                    else "all"
-                    for pdb in self.structures
+                    for pdb in chunk
                ]
            else:
                chain_selections = None
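
To show how the patched datasets surface download failures, a usage sketch follows. It assumes `ProteinGraphDataset` is importable from `graphein.ml` (as the changelog's `ml.ProteinGraphDataset` suggests) and accepts `root` and `pdb_codes` as the diff above implies; other constructor arguments are omitted and may be required.

```python
from graphein.ml import ProteinGraphDataset  # import path assumed from "ml.ProteinGraphDataset"

# Instantiation triggers download(): structures already present under
# root/raw are skipped and the rest are fetched in parallel with strict=False.
ds = ProteinGraphDataset(
    root="data",
    pdb_codes=["3eiy", "4hhb", "1lds"],
)

# PDB codes whose files could not be retrieved (e.g. obsolete entries) are
# recorded on the new attribute instead of raising:
print(ds.bad_pdbs)
```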