Support parsing and writing ANARCI CSV format

benjamin-kroeger · Nov 9, 2020 · 4f81002 · 4f81002
1 parent 06bbca5
commit 4f81002
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 8 deletions.
diff --git a/abnumber/chain.py b/abnumber/chain.py
@@ -1,11 +1,11 @@
 from collections import OrderedDict
-from typing import Union
+from typing import Union, List
 
 from Bio import SeqIO
 from Bio.SubsMat import MatrixInfo
 from Bio.SeqRecord import SeqRecord
 import sys
-
+import pandas as pd
 from abnumber.exceptions import ChainParseError
 
 try:
@@ -149,6 +149,54 @@ def to_seq_record(self, keep_tail=False):
         seq = Seq(self.seq + self.tail if keep_tail else self.seq)
         return SeqRecord(seq, id=self.name, description='')
 
+    @classmethod
+    def to_anarci_csv(cls, chains: List['Chain'], path):
+        df = cls.to_dataframe(chains)
+        df.to_csv(path)
+
+    @classmethod
+    def to_dataframe(cls, chains: List['Chain']):
+        df = pd.DataFrame([chain.to_series() for chain in chains]).fillna('-')
+        df.index.name = 'Id'
+
+        # Each chain can have a different set of positions
+        # so we need to sort the columns to make sure they are in the right order
+        # this is using the correct Position sorting
+        prop_columns = [c for c in df.columns if not isinstance(c, Position)]
+        position_columns = sorted([c for c in df.columns if isinstance(c, Position)])
+        df = df[prop_columns + position_columns]
+
+        # Finally convert position columns to string
+        df.columns = df.columns.map(lambda pos: pos.format(chain_type=False))
+
+        return df
+
+    def to_series(self):
+        props = {
+            'chain_type': self.chain_type
+        }
+        return pd.Series({**props, **self.positions}, name=self.name)
+
+    @classmethod
+    def from_series(cls, series, scheme) -> 'Chain':
+        chain_type = series['chain_type']
+        position_index = [c for c in series.index if c[:1].isnumeric()]
+        aa_dict = {Position.from_string(pos, chain_type=chain_type, scheme=scheme): aa
+                   for pos, aa in series[position_index].items() if aa != '-' and not pd.isna(aa)}
+        return cls(sequence=None, aa_dict=aa_dict, name=series.name, scheme=scheme, chain_type=chain_type)
+
+    @classmethod
+    def from_anarci_csv(cls, path, scheme, as_series=False) -> Union[List['Chain'], pd.Series]:
+        df = pd.read_csv(path, index_col=0)
+        return cls.from_dataframe(df, scheme=scheme, as_series=as_series)
+
+    @classmethod
+    def from_dataframe(cls, df, scheme, as_series=False) -> Union[List['Chain'], pd.Series]:
+        chains = [cls.from_series(series, scheme=scheme) for i, series in df.iterrows()]
+        if as_series:
+            return pd.Series(chains, index=[c.name for c in chains])
+        return chains
+
     def format(self, method='wide'):
         """Format sequence to string
 
@@ -697,8 +745,6 @@ def __getitem__(self, item):
         return self.alignment[pos]
 
 
-
-
 class Position:
     """Numbered position using a given numbering scheme
 
@@ -719,12 +765,14 @@ def __repr__(self):
     def __str__(self):
         return self.format()
 
-    def format(self, region=False, rjust=False, ljust=False, fillchar=' '):
-        formatted = f'{self.chain_type_prefix()}{self.number}{self.letter}'
+    def format(self, chain_type=True, region=False, rjust=False, ljust=False, fillchar=' '):
+        formatted = f'{self.number}{self.letter}'
+        if chain_type:
+            formatted = f'{self.chain_type_prefix()}{formatted}'
         if rjust:
-            formatted = formatted.rjust(4, fillchar)
+            formatted = formatted.rjust(4+int(chain_type), fillchar)
         if ljust:
-            formatted = formatted.ljust(4, fillchar)
+            formatted = formatted.ljust(4+int(chain_type), fillchar)
         if region:
             region = self.get_region()
             if rjust:
@@ -753,6 +801,8 @@ def __le__(self, other):
         return self == other or self < other
 
     def __lt__(self, other):
+        if not isinstance(other, Position):
+            raise TypeError(f'Cannot compare Position object with {type(other)}: {other}')
         assert self.chain_type == other.chain_type, f'Positions do not come from the same chain: {self}, {other}'
         assert self.scheme == other.scheme, 'Comparing positions in different schemes is not implemented'
         return self._sort_key() < other._sort_key()

diff --git a/setup.py b/setup.py
@@ -27,6 +27,7 @@
     url='https://github.com/prihoda/abnumber',
     install_requires=[
         "biopython",
+        "pandas"
         # ANARCI dependency specified through Bioconda
     ],
 )