Skip to content

Commit

Permalink
[Protein] add vector node features (a-r-j#124)
Browse files Browse the repository at this point in the history
* [Protein] add vector node features.

* update changelog

* minor fix to vector calc & visualisation of vector features

* update changelog

* [tests] for geometric features

* [tutorials] add vector features to resiude graph notebook

* [docs] add vector features to docs

* [protein] fix geometry calc

* update changelog with breaking changes
  • Loading branch information
a-r-j authored Mar 9, 2022
1 parent 9071090 commit fb9836c
Show file tree
Hide file tree
Showing 9 changed files with 21,926 additions and 3,433 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
### 1.2.1 - UNRELEASED

* [Feature] - #124 adds support for vector features associated protein protein geometry. #120 #122
* [Feature] - #124 adds visualisation of vector features in 3D graph plots.

#### Breaking Changes

* #124 refactors `graphein.protein.graphs.compute_rgroup_dataframe` and moves it to `graphein.protein.utils`. All internal references have been moved accordingly.

### 1.2.0 - 4/3/2022

* [Feature] - #104 adds support for asteroid plots and distance matrix visualisation.
Expand Down
2 changes: 2 additions & 0 deletions docs/source/modules/graphein.protein.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ Node
:members:
.. automodule:: graphein.protein.features.nodes.dssp
:members:
.. automodule:: graphein.protein.features.nodes.geometry
:members:

Sequence
^^^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions graphein/protein/features/nodes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .amino_acid import *
from .dssp import *
from .geometry import *
169 changes: 169 additions & 0 deletions graphein/protein/features/nodes/geometry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""Provides geometry-based featurisation functions."""
# Graphein
# Author: Arian Jamasb <[email protected]>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import logging

import networkx as nx
import numpy as np
import pandas as pd

from graphein.protein.utils import filter_dataframe


def add_sidechain_vector(
g: nx.Graph, scale: bool = True, reverse: bool = False
):
"""Adds vector from node to average position of sidechain atoms.
We compute the mean of the sidechain atoms for each node. For this we use the ``rgroup_df`` dataframe.
If the graph does not contain the ``rgroup_df`` dataframe, we compute it from the ``raw_pdb_df``.
If scale, we scale the vector to the unit vector. If reverse is True,
we reverse the vector (``sidechain - node``). If reverse is false (default) we compute (``node - sidechain``).
:param g: Graph to add vector to.
:type g: nx.Graph
:param scale: Scale vector to unit vector. Defaults to ``True``.
:type scale: bool
:param reverse: Reverse vector. Defaults to ``False``.
:type reverse: bool
"""
# Get or compute R-Group DF
if "rgroup_df" not in g.graph.keys():
g.graph["rgroup_df"] = compute_rgroup_dataframe(g.graph["raw_pdb_df"])

sc_centroid = g.graph["rgroup_df"].groupby("node_id").mean()

# Iterate over nodes and compute vector
for n, d in g.nodes(data=True):
if d["residue_name"] == "GLY":
# If GLY, set vector to 0
vec = np.array([0, 0, 0])
else:
if reverse:
vec = d["coords"] - np.array(
sc_centroid.loc[n][["x_coord", "y_coord", "z_coord"]]
)
else:
vec = (
np.array(
sc_centroid.loc[n][["x_coord", "y_coord", "z_coord"]]
)
- d["coords"]
)

if scale:
vec = vec / np.linalg.norm(vec)

d["sidechain_vector"] = vec


def add_beta_carbon_vector(
g: nx.Graph, scale: bool = True, reverse: bool = False
):
"""Adds vector from node (typically alpha carbon) to position of beta carbon.
Glycine does not have a beta carbon, so we set it to ``np.array([0, 0, 0])``.
We extract the position of the beta carbon from the unprocessed atomic PDB dataframe.
For this we use the ``raw_pdb_df`` dataframe.
If scale, we scale the vector to the unit vector. If reverse is True,
we reverse the vector (``C beta - node``). If reverse is false (default) we compute (``node - C beta``).
:param g: Graph to add vector to.
:type g: nx.Graph
:param scale: Scale vector to unit vector. Defaults to ``True``.
:type scale: bool
:param reverse: Reverse vector. Defaults to ``False``.
:type reverse: bool
"""

c_beta_coords = filter_dataframe(
g.graph["raw_pdb_df"], "atom_name", ["CB"], boolean=True
)
c_beta_coords.index = c_beta_coords["node_id"]

# Iterate over nodes and compute vector
for n, d in g.nodes(data=True):
if d["residue_name"] == "GLY":
vec = np.array([0, 0, 0])
else:
if reverse:
vec = d["coords"] - np.array(
c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]]
)
else:
vec = (
np.array(
c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]]
)
- d["coords"]
)

if scale:
vec = vec / np.linalg.norm(vec)
d["c_beta_vector"] = vec


def add_sequence_neighbour_vector(
g: nx.Graph, scale: bool = True, reverse: bool = False, n_to_c: bool = True
):
"""Computes vector from node to adjacent node in sequence.
Typically used with ``CA`` (alpha carbon) graphs.
If ``n_to_c`` is ``True`` (default), we compute the vectors from the N terminus to the C terminus (canonical direction).
If ``reverse`` is ``False`` (default), we compute ``Node_i - Node_{i+1}``.
If ``reverse is ``True``, we compute ``Node_{i+1} - Node_i``.
:param g: Graph to add vector to.
:type g: nx.Graph
:param scale: Scale vector to unit vector. Defaults to ``True``.
:type scale: bool
:param reverse: Reverse vector. Defaults to ``False``.
:type reverse: bool
:param n_to_c: Compute vector from N to C or C to N. Defaults to ``True``.
:type n_to_c: bool
"""
suffix = "n_to_c" if n_to_c else "c_to_n"
# Iterate over every chain
for chain_id in g.graph["chain_ids"]:

# Find chain residues
chain_residues = [
(n, v) for n, v in g.nodes(data=True) if v["chain_id"] == chain_id
]

if not n_to_c:
chain_residues.reverse()

# Iterate over every residue in chain
for i, residue in enumerate(chain_residues):
# Checks not at chain terminus - is this versatile enough?
if i == len(chain_residues) - 1:
residue[1][f"sequence_neighbour_vector_{suffix}"] = np.array(
[0, 0, 0]
)
continue
# Asserts residues are on the same chain
cond_1 = (
residue[1]["chain_id"] == chain_residues[i + 1][1]["chain_id"]
)
# Asserts residue numbers are adjacent
cond_2 = (
abs(
residue[1]["residue_number"]
- chain_residues[i + 1][1]["residue_number"]
)
== 1
)

# If this checks out, we compute the vector
if (cond_1) and (cond_2):
vec = chain_residues[i + 1][1]["coords"] - residue[1]["coords"]

if reverse:
vec = -vec
if scale:
vec = vec / np.linalg.norm(vec)

residue[1][f"sequence_neighbour_vector_{suffix}"] = vec
12 changes: 1 addition & 11 deletions graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from graphein.protein.edges.distance import compute_distmat
from graphein.protein.resi_atoms import BACKBONE_ATOMS
from graphein.protein.utils import (
compute_rgroup_dataframe,
filter_dataframe,
get_protein_name_from_filename,
three_to_one_with_mods,
Expand Down Expand Up @@ -189,17 +190,6 @@ def filter_hetatms(
return [df.loc[df["residue_name"] == hetatm] for hetatm in keep_hets]


def compute_rgroup_dataframe(pdb_df: pd.DataFrame) -> pd.DataFrame:
"""Return the atoms that are in R-groups and not the backbone chain.
:param pdb_df: DataFrame to compute R group dataframe from.
:type pdb_df: pd.DataFrame
:returns: Dataframe containing R-groups only (backbone atoms removed).
:rtype: pd.DataFrame
"""
return filter_dataframe(pdb_df, "atom_name", BACKBONE_ATOMS, False)


def process_dataframe(
protein_df: pd.DataFrame,
atom_df_processing_funcs: Optional[List[Callable]] = None,
Expand Down
13 changes: 12 additions & 1 deletion graphein/protein/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import wget
from Bio.PDB import PDBList

from .resi_atoms import RESI_THREE_TO_1
from .resi_atoms import BACKBONE_ATOMS, RESI_THREE_TO_1

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -105,6 +105,17 @@ def filter_dataframe(
return df


def compute_rgroup_dataframe(pdb_df: pd.DataFrame) -> pd.DataFrame:
"""Return the atoms that are in R-groups and not the backbone chain.
:param pdb_df: DataFrame to compute R group dataframe from.
:type pdb_df: pd.DataFrame
:returns: Dataframe containing R-groups only (backbone atoms removed).
:rtype: pd.DataFrame
"""
return filter_dataframe(pdb_df, "atom_name", BACKBONE_ATOMS, False)


def download_alphafold_structure(
uniprot_id: str,
out_dir: str = ".",
Expand Down
100 changes: 100 additions & 0 deletions graphein/protein/visualisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,106 @@ def plot_protein_structure_graph(
return ax


def add_vector_to_plot(
g: nx.Graph,
fig,
vector: str = "sidechain_vector",
scale: float = 5,
colour: str = "red",
width: int = 10,
) -> go.Figure:
"""Adds representations of vector features to the protein graph.
Requires all nodes have a vector feature (1 x 3 array).
:param g: Protein graph containing vector features
:type g: nx.Graph
:param fig: 3D plotly figure to add vectors to.
:type fig: go.Figure
:param vector: Name of node vector feature to add, defaults to "sidechain_vector"
:type vector: str, optional
:param scale: How much to scale the vectors by, defaults to 5
:type scale: float, optional
:param colour: Colours for vectors, defaults to "red"
:type colour: str, optional
:return: 3D Plotly plot with vectors added.
:rtype: go.Figure
"""
# Compute line segment positions
x_edges = []
y_edges = []
z_edges = []
edge_text = []
for _, d in g.nodes(data=True):
x_edges.extend(
[d["coords"][0], d["coords"][0] + d[vector][0] * scale, None]
)
y_edges.extend(
[d["coords"][1], d["coords"][1] + d[vector][1] * scale, None]
)
z_edges.extend(
[d["coords"][2], d["coords"][2] + d[vector][2] * scale, None]
)
edge_text.extend([None, f"{vector}", None])

edge_trace = go.Scatter3d(
x=x_edges,
y=y_edges,
z=z_edges,
mode="lines",
line={"color": colour, "width": width},
text=3 * [f"{vector}" for _ in range(len(g))],
hoverinfo="text",
)
# Compute cone positions.
arrow_tip_ratio = 0.1
arrow_starting_ratio = 0.98
x = []
y = []
z = []
u = []
v = []
w = []
for _, d in g.nodes(data=True):
x.extend(
[d["coords"][0] + d[vector][0] * scale * arrow_starting_ratio]
)
y.extend(
[d["coords"][1] + d[vector][1] * scale * arrow_starting_ratio]
)
z.extend(
[d["coords"][2] + d[vector][2] * scale * arrow_starting_ratio]
)
u.extend([d[vector][0]]) # * arrow_tip_ratio])
v.extend([d[vector][1]]) # * arrow_tip_ratio])
w.extend([d[vector][2]]) # * arrow_tip_ratio])

if colour == "red":
colour = [[0, "rgb(255,0,0)"], [1, "rgb(255,0,0)"]]
elif colour == "blue":
colour = [[0, "rgb(0,0,255)"], [1, "rgb(0,0,255)"]]
elif colour == "green":
colour = [[0, "rgb(0,255,0)"], [1, "rgb(0,255,0)"]]

cone_trace = go.Cone(
x=x,
y=y,
z=z,
u=u,
v=v,
w=w,
text=[f"{vector}" for _ in range(len(g.nodes()))],
hoverinfo="u+v+w+text",
colorscale=colour,
showlegend=False,
showscale=False,
sizemode="absolute",
)
fig.add_trace(edge_trace)
fig.add_trace(cone_trace)
return fig


def plot_distance_matrix(
g: Optional[nx.Graph],
dist_mat: Optional[np.ndarray] = None,
Expand Down
Loading

0 comments on commit fb9836c

Please sign in to comment.