1.5.0 patch3 (a-r-j#189)
* refactor PDB download to support parallelisation

* add tests for PDB download

* refactor PDB download function call in getcontacts

* add salt bridges and vdw interactions

* use multiprocess downloading in torch datasets

* add additional edge funcs

* add residue_id to dfs and additional constants

* update tests to account for residue id column

* update dl func to check for obsolete PDBs

* fix type error on paths

* fix obsolete test

* add bad pdb attr

* add return to PDB download util

* Merge master -> Patch (a-r-j#190)

* update bioservices to account for new UniProt API (a-r-j#187)

* update bioservices to account for new UniProt API

* update changelog

* Update PULL_REQUEST_TEMPLATE.md

* Graph plots (a-r-j#186)

* Fix param name typo in function docstring

* add scaling node size by "rsa" feature as well as degree

* add option for scaling node size by Meiler embedding dimensions. Negative values are treated as zero.

* remove walrus operator := for compatibility

* Add type hints

* Update changelog

Co-authored-by: Arian Jamasb <[email protected]>

Co-authored-by: Cam <[email protected]>

* patch torch geometric dataset and update changelog

* black, isort

* isort

Co-authored-by: Cam <[email protected]>
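
For illustration, a minimal sketch of the node-size scaling described in the "Graph plots" bullets above. The `node_size_feature` keyword is an assumption inferred from the commit messages (a-r-j#186), not a confirmed signature; check the released plotting API before relying on it.

```python
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from graphein.protein.visualisation import plotly_protein_structure_graph

# Build a residue-level graph for a small structure.
g = construct_graph(config=ProteinGraphConfig(), pdb_code="3eiy")

# Scale node sizes by a computed per-node feature instead of a fixed size.
# NOTE: `node_size_feature` is an assumed keyword based on the PR description;
# "rsa" would additionally require RSA to have been computed on the graph.
fig = plotly_protein_structure_graph(
    g,
    node_size_feature="degree",
    node_size_multiplier=1.0,
)
fig.show()
```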
a-r-j and kamurani authored Jul 11, 2022
1 parent b7721de commit 553fd26
Showing 9 changed files with 717 additions and 116 deletions.
15 changes: 14 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,22 @@
### 1.5.1

#### Changes
#### Protein

* [Feature] - [#186](https://github.com/a-r-j/graphein/pull/186) adds support for scaling node sizes in plots by a computed feature. Contribution by @cimranm
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds support for parallelised download from the PDB.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds support for van der Waals interactions, vdw clashes, pi-stacking interactions, t_stacking interactions, backbone carbonyl-carbonyl interactions, and salt bridges.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds a `residue_id` column to PDB dfs to enable easier accounting in atom graphs.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) refactors torch geometric datasets to use parallelised download for faster dataset preparation.
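
For illustration, a minimal sketch of the parallelised download added in #189. The call mirrors the usage visible in the dataset diff below (a list of PDB codes, an output directory, `max_workers`, `strict`); anything beyond that, such as creating the output directory up front, is an assumption.

```python
from pathlib import Path

from graphein.protein.utils import download_pdb_multiprocessing

out_dir = Path("pdbs")
out_dir.mkdir(exist_ok=True)
pdb_codes = ["3eiy", "4hhb", "1lds"]

# Fetch structures in parallel. With strict=False, failures (e.g. obsolete
# entries) do not raise, mirroring how the torch datasets use it below.
download_pdb_multiprocessing(pdb_codes, out_dir, max_workers=4, strict=False)

# Codes whose files did not appear can be collected afterwards:
missing = [p for p in pdb_codes if not (out_dir / f"{p}.pdb").exists()]
```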

#### Bugfixes

* [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.
* [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`.

#### Breaking Changes

* [#189](https://github.com/a-r-j/graphein/pull/189/) refactors the PDB download util. It now returns the path to the downloaded file and no longer accepts a config object; instead it takes the output directory path directly.
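
A short sketch of the refactored util per the entry above: pass the output directory directly and use the returned path. The positional argument order shown here is an assumption.

```python
from graphein.protein.utils import download_pdb

# Previously the function took a config object; now the output directory is
# passed directly and the path to the downloaded file is returned.
pdb_path = download_pdb("4hhb", "pdbs")  # argument order assumed
print(pdb_path)  # e.g. pdbs/4hhb.pdb
```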


### 1.5.0

48 changes: 44 additions & 4 deletions graphein/ml/datasets/torch_geometric_dataset.py
@@ -17,7 +17,11 @@
from graphein.ml.conversion import GraphFormatConvertor
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graphs_mp
-from graphein.protein.utils import download_alphafold_structure, download_pdb
+from graphein.protein.utils import (
+    download_alphafold_structure,
+    download_pdb,
+    download_pdb_multiprocessing,
+)
from graphein.utils.utils import import_message

try:
@@ -138,6 +142,9 @@ def __init__(
        elif self.uniprot_ids:
            self.structures = uniprot_ids
            self.af_version = af_version
+        self.bad_pdbs: List[
+            str
+        ] = []  # list of pdb codes that failed to download

        # Labels & Chains
        self.graph_label_map = graph_label_map
@@ -173,7 +180,23 @@ def download(self):
        """Download the PDB files from RCSB or Alphafold."""
        self.config.pdb_dir = Path(self.raw_dir)
        if self.pdb_codes:
-            [download_pdb(self.config, pdb) for pdb in tqdm(self.pdb_codes)]
+            # Only download PDBs that are not already downloaded
+            to_download = [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
+            download_pdb_multiprocessing(
+                to_download,
+                self.raw_dir,
+                max_workers=self.num_cores,
+                strict=False,
+            )
+            self.bad_pdbs = self.bad_pdbs + [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
        if self.uniprot_ids:
            [
                download_alphafold_structure(
@@ -375,6 +398,7 @@ def __init__(
        self.graph_label_map = graph_label_map
        self.node_label_map = node_label_map
        self.chain_selection_map = chain_selection_map
+        self.bad_pdbs: List[str] = []

        # Configs
        self.config = graphein_config
@@ -404,7 +428,23 @@ def download(self):
        """Download the PDB files from RCSB or Alphafold."""
        self.config.pdb_dir = Path(self.raw_dir)
        if self.pdb_codes:
-            [download_pdb(self.config, pdb) for pdb in tqdm(self.pdb_codes)]
+            # Only download undownloaded PDBs
+            to_download = [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
+            download_pdb_multiprocessing(
+                to_download,
+                self.raw_dir,
+                max_workers=self.num_cores,
+                strict=False,
+            )
+            self.bad_pdbs = self.bad_pdbs + [
+                pdb
+                for pdb in set(self.pdb_codes)
+                if not os.path.exists(Path(self.raw_dir) / f"{pdb}.pdb")
+            ]
        if self.uniprot_ids:
            [
                download_alphafold_structure(
@@ -458,7 +498,7 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
                    self.chain_selection_map[pdb]
                    if pdb in self.chain_selection_map.keys()
                    else "all"
-                    for pdb in self.structures
+                    for pdb in chunk
                ]
            else:
                chain_selections = None
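
To show how the patched datasets surface download failures, a usage sketch follows. It assumes `ProteinGraphDataset` is importable from `graphein.ml` (as the changelog's `ml.ProteinGraphDataset` suggests) and accepts `root` and `pdb_codes` as the diff above implies; other constructor arguments are omitted and may be required.

```python
from graphein.ml import ProteinGraphDataset  # import path assumed from "ml.ProteinGraphDataset"

# Instantiation triggers download(): structures already present under
# root/raw are skipped and the rest are fetched in parallel with strict=False.
ds = ProteinGraphDataset(
    root="data",
    pdb_codes=["3eiy", "4hhb", "1lds"],
)

# PDB codes whose files could not be retrieved (e.g. obsolete entries) are
# recorded on the new attribute instead of raising:
print(ds.bad_pdbs)
```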