[Protein] add vector node features (a-r-j#124)

* [Protein] add vector node features. * update changelog * minor fix to vector calc & visualisation of vector features * update changelog * [tests] for geometric features * [tutorials] add vector features to resiude graph notebook * [docs] add vector features to docs * [protein] fix geometry calc * update changelog with breaking changes
kamurani · Mar 9, 2022 · fb9836c · fb9836c
1 parent 9071090
commit fb9836c
Show file tree

Hide file tree

Showing 9 changed files with 21,926 additions and 3,433 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+### 1.2.1 - UNRELEASED
+
+* [Feature] - #124 adds support for vector features associated protein protein geometry. #120 #122
+* [Feature] - #124 adds visualisation of vector features in 3D graph plots.
+
+#### Breaking Changes
+
+* #124 refactors `graphein.protein.graphs.compute_rgroup_dataframe` and moves it to `graphein.protein.utils`. All internal references have been moved accordingly.
+
 ### 1.2.0 - 4/3/2022
 
 * [Feature] - #104 adds support for asteroid plots and distance matrix visualisation.

diff --git a/docs/source/modules/graphein.protein.rst b/docs/source/modules/graphein.protein.rst
@@ -39,6 +39,8 @@ Node
     :members:
 .. automodule:: graphein.protein.features.nodes.dssp
     :members:
+.. automodule:: graphein.protein.features.nodes.geometry
+    :members:
 
 Sequence
 ^^^^^^^^^

diff --git a/graphein/protein/features/nodes/__init__.py b/graphein/protein/features/nodes/__init__.py
@@ -1,2 +1,3 @@
 from .amino_acid import *
 from .dssp import *
+from .geometry import *
diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py
@@ -0,0 +1,169 @@
+"""Provides geometry-based featurisation functions."""
+# Graphein
+# Author: Arian Jamasb <[email protected]>
+# License: MIT
+# Project Website: https://github.com/a-r-j/graphein
+# Code Repository: https://github.com/a-r-j/graphein
+import logging
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+
+from graphein.protein.utils import filter_dataframe
+
+
+def add_sidechain_vector(
+    g: nx.Graph, scale: bool = True, reverse: bool = False
+):
+    """Adds vector from node to average position of sidechain atoms.
+
+    We compute the mean of the sidechain atoms for each node. For this we use the ``rgroup_df`` dataframe.
+    If the graph does not contain the ``rgroup_df`` dataframe, we compute it from the ``raw_pdb_df``.
+    If scale, we scale the vector to the unit vector. If reverse is True,
+    we reverse the vector (``sidechain - node``). If reverse is false (default) we compute (``node - sidechain``).
+
+    :param g: Graph to add vector to.
+    :type g: nx.Graph
+    :param scale: Scale vector to unit vector. Defaults to ``True``.
+    :type scale: bool
+    :param reverse: Reverse vector. Defaults to ``False``.
+    :type reverse: bool
+    """
+    # Get or compute R-Group DF
+    if "rgroup_df" not in g.graph.keys():
+        g.graph["rgroup_df"] = compute_rgroup_dataframe(g.graph["raw_pdb_df"])
+
+    sc_centroid = g.graph["rgroup_df"].groupby("node_id").mean()
+
+    # Iterate over nodes and compute vector
+    for n, d in g.nodes(data=True):
+        if d["residue_name"] == "GLY":
+            # If GLY, set vector to 0
+            vec = np.array([0, 0, 0])
+        else:
+            if reverse:
+                vec = d["coords"] - np.array(
+                    sc_centroid.loc[n][["x_coord", "y_coord", "z_coord"]]
+                )
+            else:
+                vec = (
+                    np.array(
+                        sc_centroid.loc[n][["x_coord", "y_coord", "z_coord"]]
+                    )
+                    - d["coords"]
+                )
+
+            if scale:
+                vec = vec / np.linalg.norm(vec)
+
+        d["sidechain_vector"] = vec
+
+
+def add_beta_carbon_vector(
+    g: nx.Graph, scale: bool = True, reverse: bool = False
+):
+    """Adds vector from node (typically alpha carbon) to position of beta carbon.
+
+    Glycine does not have a beta carbon, so we set it to ``np.array([0, 0, 0])``.
+    We extract the position of the beta carbon from the unprocessed atomic PDB dataframe.
+    For this we use the ``raw_pdb_df`` dataframe.
+    If scale, we scale the vector to the unit vector. If reverse is True,
+    we reverse the vector (``C beta - node``). If reverse is false (default) we compute (``node - C beta``).
+
+    :param g: Graph to add vector to.
+    :type g: nx.Graph
+    :param scale: Scale vector to unit vector. Defaults to ``True``.
+    :type scale: bool
+    :param reverse: Reverse vector. Defaults to ``False``.
+    :type reverse: bool
+    """
+
+    c_beta_coords = filter_dataframe(
+        g.graph["raw_pdb_df"], "atom_name", ["CB"], boolean=True
+    )
+    c_beta_coords.index = c_beta_coords["node_id"]
+
+    # Iterate over nodes and compute vector
+    for n, d in g.nodes(data=True):
+        if d["residue_name"] == "GLY":
+            vec = np.array([0, 0, 0])
+        else:
+            if reverse:
+                vec = d["coords"] - np.array(
+                    c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]]
+                )
+            else:
+                vec = (
+                    np.array(
+                        c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]]
+                    )
+                    - d["coords"]
+                )
+
+            if scale:
+                vec = vec / np.linalg.norm(vec)
+        d["c_beta_vector"] = vec
+
+
+def add_sequence_neighbour_vector(
+    g: nx.Graph, scale: bool = True, reverse: bool = False, n_to_c: bool = True
+):
+    """Computes vector from node to adjacent node in sequence.
+    Typically used with ``CA`` (alpha carbon) graphs.
+
+    If ``n_to_c`` is ``True`` (default), we compute the vectors from the N terminus to the C terminus (canonical direction).
+    If ``reverse`` is ``False`` (default), we compute ``Node_i - Node_{i+1}``.
+    If ``reverse is ``True``, we compute ``Node_{i+1} - Node_i``.
+    :param g: Graph to add vector to.
+    :type g: nx.Graph
+    :param scale: Scale vector to unit vector. Defaults to ``True``.
+    :type scale: bool
+    :param reverse: Reverse vector. Defaults to ``False``.
+    :type reverse: bool
+    :param n_to_c: Compute vector from N to C or C to N. Defaults to ``True``.
+    :type n_to_c: bool
+    """
+    suffix = "n_to_c" if n_to_c else "c_to_n"
+    # Iterate over every chain
+    for chain_id in g.graph["chain_ids"]:
+
+        # Find chain residues
+        chain_residues = [
+            (n, v) for n, v in g.nodes(data=True) if v["chain_id"] == chain_id
+        ]
+
+        if not n_to_c:
+            chain_residues.reverse()
+
+        # Iterate over every residue in chain
+        for i, residue in enumerate(chain_residues):
+            # Checks not at chain terminus - is this versatile enough?
+            if i == len(chain_residues) - 1:
+                residue[1][f"sequence_neighbour_vector_{suffix}"] = np.array(
+                    [0, 0, 0]
+                )
+                continue
+            # Asserts residues are on the same chain
+            cond_1 = (
+                residue[1]["chain_id"] == chain_residues[i + 1][1]["chain_id"]
+            )
+            # Asserts residue numbers are adjacent
+            cond_2 = (
+                abs(
+                    residue[1]["residue_number"]
+                    - chain_residues[i + 1][1]["residue_number"]
+                )
+                == 1
+            )
+
+            # If this checks out, we compute the vector
+            if (cond_1) and (cond_2):
+                vec = chain_residues[i + 1][1]["coords"] - residue[1]["coords"]
+
+                if reverse:
+                    vec = -vec
+                if scale:
+                    vec = vec / np.linalg.norm(vec)
+
+            residue[1][f"sequence_neighbour_vector_{suffix}"] = vec
diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py
@@ -24,6 +24,7 @@
 from graphein.protein.edges.distance import compute_distmat
 from graphein.protein.resi_atoms import BACKBONE_ATOMS
 from graphein.protein.utils import (
+    compute_rgroup_dataframe,
     filter_dataframe,
     get_protein_name_from_filename,
     three_to_one_with_mods,
@@ -189,17 +190,6 @@ def filter_hetatms(
     return [df.loc[df["residue_name"] == hetatm] for hetatm in keep_hets]
 
 
-def compute_rgroup_dataframe(pdb_df: pd.DataFrame) -> pd.DataFrame:
-    """Return the atoms that are in R-groups and not the backbone chain.
-
-    :param pdb_df: DataFrame to compute R group dataframe from.
-    :type pdb_df: pd.DataFrame
-    :returns: Dataframe containing R-groups only (backbone atoms removed).
-    :rtype: pd.DataFrame
-    """
-    return filter_dataframe(pdb_df, "atom_name", BACKBONE_ATOMS, False)
-
-
 def process_dataframe(
     protein_df: pd.DataFrame,
     atom_df_processing_funcs: Optional[List[Callable]] = None,

diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py
@@ -16,7 +16,7 @@
 import wget
 from Bio.PDB import PDBList
 
-from .resi_atoms import RESI_THREE_TO_1
+from .resi_atoms import BACKBONE_ATOMS, RESI_THREE_TO_1
 
 log = logging.getLogger(__name__)
 
@@ -105,6 +105,17 @@ def filter_dataframe(
     return df
 
 
+def compute_rgroup_dataframe(pdb_df: pd.DataFrame) -> pd.DataFrame:
+    """Return the atoms that are in R-groups and not the backbone chain.
+
+    :param pdb_df: DataFrame to compute R group dataframe from.
+    :type pdb_df: pd.DataFrame
+    :returns: Dataframe containing R-groups only (backbone atoms removed).
+    :rtype: pd.DataFrame
+    """
+    return filter_dataframe(pdb_df, "atom_name", BACKBONE_ATOMS, False)
+
+
 def download_alphafold_structure(
     uniprot_id: str,
     out_dir: str = ".",

diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py
@@ -406,6 +406,106 @@ def plot_protein_structure_graph(
     return ax
 
 
+def add_vector_to_plot(
+    g: nx.Graph,
+    fig,
+    vector: str = "sidechain_vector",
+    scale: float = 5,
+    colour: str = "red",
+    width: int = 10,
+) -> go.Figure:
+    """Adds representations of vector features to the protein graph.
+
+    Requires all nodes have a vector feature (1 x 3 array).
+
+    :param g: Protein graph containing vector features
+    :type g: nx.Graph
+    :param fig: 3D plotly figure to add vectors to.
+    :type fig: go.Figure
+    :param vector: Name of node vector feature to add, defaults to "sidechain_vector"
+    :type vector: str, optional
+    :param scale: How much to scale the vectors by, defaults to 5
+    :type scale: float, optional
+    :param colour: Colours for vectors, defaults to "red"
+    :type colour: str, optional
+    :return: 3D Plotly plot with vectors added.
+    :rtype: go.Figure
+    """
+    # Compute line segment positions
+    x_edges = []
+    y_edges = []
+    z_edges = []
+    edge_text = []
+    for _, d in g.nodes(data=True):
+        x_edges.extend(
+            [d["coords"][0], d["coords"][0] + d[vector][0] * scale, None]
+        )
+        y_edges.extend(
+            [d["coords"][1], d["coords"][1] + d[vector][1] * scale, None]
+        )
+        z_edges.extend(
+            [d["coords"][2], d["coords"][2] + d[vector][2] * scale, None]
+        )
+        edge_text.extend([None, f"{vector}", None])
+
+    edge_trace = go.Scatter3d(
+        x=x_edges,
+        y=y_edges,
+        z=z_edges,
+        mode="lines",
+        line={"color": colour, "width": width},
+        text=3 * [f"{vector}" for _ in range(len(g))],
+        hoverinfo="text",
+    )
+    # Compute cone positions.
+    arrow_tip_ratio = 0.1
+    arrow_starting_ratio = 0.98
+    x = []
+    y = []
+    z = []
+    u = []
+    v = []
+    w = []
+    for _, d in g.nodes(data=True):
+        x.extend(
+            [d["coords"][0] + d[vector][0] * scale * arrow_starting_ratio]
+        )
+        y.extend(
+            [d["coords"][1] + d[vector][1] * scale * arrow_starting_ratio]
+        )
+        z.extend(
+            [d["coords"][2] + d[vector][2] * scale * arrow_starting_ratio]
+        )
+        u.extend([d[vector][0]])  # * arrow_tip_ratio])
+        v.extend([d[vector][1]])  # * arrow_tip_ratio])
+        w.extend([d[vector][2]])  # * arrow_tip_ratio])
+
+    if colour == "red":
+        colour = [[0, "rgb(255,0,0)"], [1, "rgb(255,0,0)"]]
+    elif colour == "blue":
+        colour = [[0, "rgb(0,0,255)"], [1, "rgb(0,0,255)"]]
+    elif colour == "green":
+        colour = [[0, "rgb(0,255,0)"], [1, "rgb(0,255,0)"]]
+
+    cone_trace = go.Cone(
+        x=x,
+        y=y,
+        z=z,
+        u=u,
+        v=v,
+        w=w,
+        text=[f"{vector}" for _ in range(len(g.nodes()))],
+        hoverinfo="u+v+w+text",
+        colorscale=colour,
+        showlegend=False,
+        showscale=False,
+        sizemode="absolute",
+    )
+    fig.add_trace(edge_trace)
+    fig.add_trace(cone_trace)
+    return fig
+
+
 def plot_distance_matrix(
     g: Optional[nx.Graph],
     dist_mat: Optional[np.ndarray] = None,