Add first working version

benjamin-kroeger · Oct 12, 2020 · 805eacc · 805eacc
commit 805eacc
Show file tree

Hide file tree

Showing 9 changed files with 435 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+.idea
+*.pyc
+__pycache__
+/dist
+/build
+/work
+*.egg-info
+.pytest_cache
+
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,19 @@
+The MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,27 @@
+.PHONY: install release dist test
+
+# Install the package in editable (development) mode
+install:
+	pip install -e .
+
+# Publish a release: write the version file on master, commit and push it,
+# build and upload the distributions, then rebase develop onto master.
+# Requires VERSION, e.g.: make release VERSION=0.1.1
+release:
+ifndef VERSION
+	$(error "Usage: make release VERSION=0.1.1")
+endif
+	git checkout master
+	git pull
+	echo "__version__ = '$(VERSION)'" > abnumber/__version__.py
+	git add abnumber/__version__.py
+	git commit -m "Set version to $(VERSION)"
+	git push
+	# Use $(MAKE) rather than a literal "make" so the same make program,
+	# command-line flags and jobserver are used by the sub-make
+	$(MAKE) dist
+	twine upload dist/abnumber-$(VERSION)*
+	git checkout develop
+	git pull
+	git rebase origin/master
+	@echo "Create a new release version on: https://github.com/prihoda/abnumber/releases"
+
+# Build source and wheel distributions
+dist:
+	python setup.py sdist bdist_wheel
+
+# Run the test suite
+test:
+	pytest test
diff --git a/README.md b/README.md
@@ -0,0 +1,23 @@
+# AbNumber
+
+Convenience Python APIs for antibody numbering using [ANARCI](https://github.com/oxpig/ANARCI)
+
+## Installation
+
+Install using Bioconda:
+```bash
+conda install -c bioconda abnumber
+```
+
+## Usage
+
+```python
+from abnumber import Chain
+
+chain = Chain.from_str('ELVMTQSPSSLSASVGDRVNIACRASQGISSALAWYQQKPGKAPRLLIYDASNLESGVPSRFSGSGSGTDFTLTISSLQPEDFAIYYCQQFNSYPLTFGGGTKVEIKRTV')
+print(chain)
+```
+
+## Credits
+
+See [ANARCI on GitHub](https://github.com/oxpig/ANARCI) and the ANARCI paper: [ANARCI: antigen receptor numbering and receptor classification](https://doi.org/10.1093/bioinformatics/btv552)
diff --git a/abnumber/__init__.py b/abnumber/__init__.py
@@ -0,0 +1,2 @@
+from abnumber.__version__ import __version__
+from abnumber.chain import Chain
diff --git a/abnumber/__version__.py b/abnumber/__version__.py
@@ -0,0 +1 @@
+# Package version; this file is overwritten by the `release` target of the Makefile
+__version__ = '0.1.1'
diff --git a/abnumber/chain.py b/abnumber/chain.py
@@ -0,0 +1,303 @@
+from collections import OrderedDict
+from Bio.SubsMat import MatrixInfo
+from anarci.anarci import anarci
+from termcolor import colored as colored_fn
+
+
+class Chain:
+    """Antibody chain whose residues are grouped into IMGT framework and CDR regions.
+
+    Each of the attributes ``fw1``, ``cdr1``, ``fw2``, ``cdr2``, ``fw3``, ``cdr3`` and ``fw4``
+    is an ``OrderedDict`` mapping a position object to an upper-case residue letter.
+    """
+
+    def __init__(self, aa_dict, name=None):
+        """Sort the residues by position and assign each one to its region.
+
+        :param aa_dict: mapping of position -> amino acid letter; positions need to be sortable
+            and expose a ``number`` attribute that is compared against ``IMGT_BORDERS``
+        :param name: optional chain name, stored as-is
+        :raises IndexError: if a position number is not below the last IMGT border
+        """
+        self.name = name
+        self.fw1 = OrderedDict()
+        self.cdr1 = OrderedDict()
+        self.fw2 = OrderedDict()
+        self.cdr2 = OrderedDict()
+        self.fw3 = OrderedDict()
+        self.cdr3 = OrderedDict()
+        self.fw4 = OrderedDict()
+
+        regions_list = [self.fw1, self.cdr1, self.fw2, self.cdr2, self.fw3, self.cdr3, self.fw4]
+        region = 0
+        # Positions are visited in sorted order, so the region index only ever moves forward
+        for pos in sorted(aa_dict):
+            while region < len(regions_list) and pos.number >= IMGT_BORDERS[region]:
+                region += 1
+            if region == len(regions_list):
+                # Fail with a readable message instead of "list index out of range"
+                raise IndexError(
+                    f'Position "{pos}" is outside of the supported range, '
+                    f'position number needs to be lower than {IMGT_BORDERS[-1]}')
+            aa = aa_dict[pos].upper()
+            # TODO validate amino acid aa
+            regions_list[region][pos] = aa
+
+    def clone(self, replace_seq=None):
+        """Create a copy of this chain, optionally with a different sequence at the same positions.
+
+        :param replace_seq: optional sequence of residues, one for each position of this chain
+        :return: new Chain with the same name
+        :raises ValueError: if ``replace_seq`` has a different length than this chain
+        """
+        positions = self.positions
+        if replace_seq is None:
+            aa_dict = dict(positions)
+        else:
+            # Explicit check instead of assert, which would be skipped with "python -O"
+            # and let a longer sequence be silently truncated
+            if len(replace_seq) != len(positions):
+                raise ValueError('Sequence needs to be the same length')
+            aa_dict = dict(zip(positions, replace_seq))
+        return Chain(aa_dict=aa_dict, name=self.name)
+
+    def __repr__(self):
+        return self.seq
+
+    def get(self, position, default=None):
+        """Return the residue at given position, or ``default`` if the position is not present."""
+        for aa_dict in self.regions.values():
+            if position in aa_dict:
+                return aa_dict[position]
+        return default
+
+    def __getitem__(self, position):
+        """Return the residue at given position.
+
+        :raises IndexError: if the position is not present in the chain
+        """
+        aa = self.get(position)
+        if aa is None:
+            raise IndexError(f'Position "{position}" not found in chain')
+        return aa
+
+    def format(self, method='wide'):
+        """Format the chain as a string.
+
+        :param method: ``"wide"`` (see :meth:`to_wide_string`) or ``"tall"`` (see :meth:`to_tall_string`)
+        :raises ValueError: for any other method
+        """
+        if method == 'wide':
+            return self.to_wide_string()
+        elif method == 'tall':
+            return self.to_tall_string()
+        raise ValueError(f'Use method="wide" or method="tall", unknown method: "{method}"')
+
+    def to_tall_string(self):
+        """Return one line per residue: region name, position and amino acid."""
+        seq = []
+        for region, aa_dict in self.regions.items():
+            for pos, aa in aa_dict.items():
+                # "!s" converts the position to str first; padding the object directly
+                # raises TypeError because it does not implement __format__
+                seq.append(f'{region: >4} {pos!s: <4} {aa}')
+        return '\n'.join(seq)
+
+    def to_wide_string(self):
+        """Return the sequence followed by a second line that marks CDR residues with ``^``."""
+        annot = ''.join(
+            ('^' if region.startswith('cdr') else ' ') * len(aa_dict)
+            for region, aa_dict in self.regions.items()
+        )
+        return self.seq + '\n' + annot
+
+    def has_same_cdr_positions(self, other):
+        """Return True if all three CDRs consist of the same positions in both chains."""
+        cdr_pairs = [(self.cdr1, other.cdr1), (self.cdr2, other.cdr2), (self.cdr3, other.cdr3)]
+        # Equal key views imply equal length, no separate length check is needed
+        return all(own.keys() == theirs.keys() for own, theirs in cdr_pairs)
+
+    @staticmethod
+    def _count_matches(own_region, other_region):
+        """Count positions of ``own_region`` that hold the same residue in ``other_region``."""
+        return sum(aa == other_region.get(pos) for pos, aa in own_region.items())
+
+    def get_fw1_matches(self, other):
+        """Return number of FW1 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.fw1, other.fw1)
+
+    def get_cdr1_matches(self, other):
+        """Return number of CDR1 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.cdr1, other.cdr1)
+
+    def get_fw2_matches(self, other):
+        """Return number of FW2 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.fw2, other.fw2)
+
+    def get_cdr2_matches(self, other):
+        """Return number of CDR2 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.cdr2, other.cdr2)
+
+    def get_fw3_matches(self, other):
+        """Return number of FW3 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.fw3, other.fw3)
+
+    def get_cdr3_matches(self, other):
+        """Return number of CDR3 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.cdr3, other.cdr3)
+
+    def get_fw4_matches(self, other):
+        """Return number of FW4 positions of this chain with an identical residue in ``other``."""
+        return self._count_matches(self.fw4, other.fw4)
+
+    def get_fw_matches(self, other):
+        """Return number of matching residues summed over all four framework regions."""
+        return self.get_fw1_matches(other) + self.get_fw2_matches(other) + self.get_fw3_matches(
+            other) + self.get_fw4_matches(other)
+
+    def get_cdr_matches(self, other):
+        """Return number of matching residues summed over all three CDRs."""
+        return self.get_cdr1_matches(other) + self.get_cdr2_matches(other) + self.get_cdr3_matches(other)
+
+    def get_matches(self, other):
+        """Return number of matching residues over all regions."""
+        return self.get_fw_matches(other) + self.get_cdr_matches(other)
+
+    def align(self, *others):
+        """Align this chain with other chains by position.
+
+        :param others: chains to align with
+        :return: Alignment over the sorted union of positions, with ``'-'`` where a chain has no residue
+        """
+        pos_dicts = [self.positions]
+        pos_dicts.extend(other.positions for other in others)
+        shared_pos = sorted({pos for pos_dict in pos_dicts for pos in pos_dict})
+        residues = [tuple(pos_dict.get(pos, '-') for pos_dict in pos_dicts) for pos in shared_pos]
+        return Alignment(shared_pos, residues)
+
+    @classmethod
+    def from_str(cls, seq_str, scheme='imgt', allowed_species=None, name=None):
+        """Number a sequence using ANARCI and create a chain from the numbered residues.
+
+        :param seq_str: amino acid sequence
+        :param scheme: numbering scheme, only ``"imgt"`` is supported
+        :param allowed_species: passed to ANARCI,
+            allowed species: ['human', 'mouse','rat','rabbit','rhesus','pig','alpaca']
+        :param name: optional chain name
+        :raises ValueError: if the scheme is not supported or no alignment was found
+        :raises NotImplementedError: if more than one domain was found in the sequence
+        """
+        # Explicit check instead of assert, which would be skipped with "python -O"
+        # and let other numberings be split by IMGT borders
+        if scheme not in ['imgt']:
+            raise ValueError(f'Unsupported scheme: "{scheme}", only "imgt" is supported')
+        all_numbered, _all_ali, _all_hits = anarci([('id', seq_str)], scheme=scheme, allowed_species=allowed_species)
+        # We only have one sequence
+        numbered = all_numbered[0]
+        if numbered is None:
+            raise ValueError(f'No alignment found for sequence: "{seq_str}"')
+        if len(numbered) != 1:
+            raise NotImplementedError(f'Unsupported: Multiple ANARCI domains found in sequence: "{seq_str}"')
+        positions, _start, _end = numbered[0]
+        # FIXME assign chain to cls.chain
+        # Gap residues ('-') are not stored
+        aa_dict = {
+            Position(chain='?', number=num, letter=letter, scheme=scheme): aa
+            for (num, letter), aa in positions
+            if aa != '-'
+        }
+        return cls(aa_dict, name=name)
+
+    @property
+    def regions(self):
+        """Ordered mapping of region name -> region dict, from ``fw1`` to ``fw4``."""
+        return OrderedDict(
+            fw1=self.fw1,
+            cdr1=self.cdr1,
+            fw2=self.fw2,
+            cdr2=self.cdr2,
+            fw3=self.fw3,
+            cdr3=self.cdr3,
+            fw4=self.fw4
+        )
+
+    @property
+    def positions(self):
+        """Ordered mapping of position -> residue over all regions (a new dict on each access)."""
+        positions = OrderedDict()
+        for aa_dict in self.regions.values():
+            positions.update(aa_dict)
+        return positions
+
+    @property
+    def seq(self):
+        """Sequence of the whole chain as a string."""
+        return ''.join(self.positions.values())
+
+    @property
+    def fw1_seq(self):
+        """Sequence of the FW1 region as a string."""
+        return ''.join(self.fw1.values())
+
+    @property
+    def cdr1_seq(self):
+        """Sequence of the CDR1 region as a string."""
+        return ''.join(self.cdr1.values())
+
+    @property
+    def fw2_seq(self):
+        """Sequence of the FW2 region as a string."""
+        return ''.join(self.fw2.values())
+
+    @property
+    def cdr2_seq(self):
+        """Sequence of the CDR2 region as a string."""
+        return ''.join(self.cdr2.values())
+
+    @property
+    def fw3_seq(self):
+        """Sequence of the FW3 region as a string."""
+        return ''.join(self.fw3.values())
+
+    @property
+    def cdr3_seq(self):
+        """Sequence of the CDR3 region as a string."""
+        return ''.join(self.cdr3.values())
+
+    @property
+    def fw4_seq(self):
+        """Sequence of the FW4 region as a string."""
+        return ''.join(self.fw4.values())
+
+
+class Alignment:
+    """Position-wise alignment: a list of positions with a tuple of residues for each of them."""
+
+    def __init__(self, positions, residues):
+        self.positions = positions
+        self.residues = residues
+        # Pre-computed (position, residues) pairs used for iteration and indexing
+        self._zipped = list(zip(self.positions, self.residues))
+
+    def __repr__(self):
+        return self.format(colored=False)
+
+    def __iter__(self):
+        yield from self._zipped
+
+    def __getitem__(self, item):
+        return self._zipped[item]
+
+    def format(self, mark_identity=True, mark_cdrs=True, colored=False):
+        """Format a pairwise alignment as multi-line text.
+
+        The output contains the first sequence, an optional identity line
+        (``|`` identical, ``+`` similar, ``.`` different), the second sequence
+        and an optional line marking CDR positions with ``^``.
+
+        :param mark_identity: include the identity line
+        :param mark_cdrs: include the CDR line
+        :param colored: highlight mismatches (yellow for similar residues, red otherwise)
+        """
+        first, second = [], []
+        identity_marks, cdr_marks = [], []
+        # TODO support multiple sequence alignment
+        for pos, (a, b) in self:
+            if colored and a != b:
+                background = 'on_yellow' if is_similar_residue(a, b) else 'on_red'
+                first.append(colored_fn(a, 'white', background, attrs=['bold']))
+                second.append(colored_fn(b, 'white', background, attrs=['bold']))
+            else:
+                first.append(a)
+                second.append(b)
+
+            if mark_identity:
+                if a == b:
+                    identity_marks.append('|')
+                elif is_similar_residue(a, b):
+                    identity_marks.append('+')
+                else:
+                    identity_marks.append('.')
+            if mark_cdrs:
+                cdr_marks.append('^' if pos.is_in_cdr() else ' ')
+
+        lines = [''.join(first)]
+        if mark_identity:
+            lines.append(''.join(identity_marks))
+        lines.append(''.join(second))
+        if mark_cdrs:
+            lines.append(''.join(cdr_marks))
+        return '\n'.join(lines)
+
+
+class Position:
+    """Numbered position of a residue, identified by chain, number, insertion letter and scheme."""
+
+    def __init__(self, chain, number, letter, scheme):
+        self.chain = chain # TODO type
+        # Numeric part of the position, compared with integer constants in this module
+        self.number: int = number
+        # Insertion letter, an empty value is sorted as a position without insertion
+        self.letter: str = letter
+        self.scheme = scheme # TODO type
+
+    def __repr__(self):
+        return f'{self.chain}{self.number}{self.letter}({self.scheme})'
+
+    def __str__(self):
+        return f'{self.chain}{self.number}{self.letter}'
+
+    def _key(self):
+        """Return the fields that identify this position.
+
+        A tuple is used instead of the concatenated repr, where different fields
+        can produce the same text (number 11 without letter, number 1 with letter "1").
+        """
+        return self.chain, self.number, self.letter, self.scheme
+
+    def __hash__(self):
+        return hash(self._key())
+
+    def __eq__(self, other):
+        if not isinstance(other, Position):
+            # Let Python fall back to its default comparison, which evaluates to False
+            return NotImplemented
+        return self._key() == other._key()
+
+    def __lt__(self, other):
+        """Order positions of the same chain and scheme by :attr:`sort_key`."""
+        assert self.chain == other.chain, f'Positions do not come from the same chain: {self}, {other}'
+        assert self.scheme == other.scheme, 'Comparing positions in different schemes is not implemented'
+        return self.sort_key < other.sort_key
+
+    @property
+    def sort_key(self):
+        """Tuple of (number, letter order) used to sort positions.
+
+        :raises NotImplementedError: for schemes other than ``"imgt"``
+        """
+        if self.scheme != 'imgt':
+            raise NotImplementedError(f'Cannot compare positions of scheme: {self.scheme}')
+        letter_ord = ord(self.letter) if self.letter else 0
+        if self.number == 112:
+            # position 112 is sorted in reverse
+            letter_ord = -letter_ord
+        return self.number, letter_ord
+
+    def get_region(self):
+        """Return the region name of this position, looked up by number in ``IMGT_POS_DICT``.
+
+        :raises NotImplementedError: for schemes other than ``"imgt"``
+        """
+        if self.scheme == 'imgt':
+            return IMGT_POS_DICT[self.number]
+        else:
+            raise NotImplementedError(f'Not supported scheme: {self.scheme}')
+
+    def is_in_cdr(self):
+        """Return True if the region name of this position starts with ``cdr``."""
+        # FIXME
+        return self.get_region().lower().startswith('cdr')
+
+
+def is_similar_residue(a, b, matrix=MatrixInfo.blosum62):
+    """Return True if two residues have a positive score in the substitution matrix.
+
+    A gap (``'-'``) is only similar to another gap. The matrix is looked up
+    with the pair in given order first and in swapped order otherwise.
+    """
+    if '-' in (a, b):
+        return a == b
+    score = matrix[(a, b)] if (a, b) in matrix else matrix[(b, a)]
+    return score > 0
+
+
+# Exclusive upper bound of the position numbers of each region, in order fw1, cdr1, ..., fw4
+IMGT_BORDERS = [27, 39, 56, 66, 105, 118, 129]
+
+# Position numbers of each region: fw1 starts at 1, every other region starts at the previous border
+IMGT_FW1 = list(range(1, IMGT_BORDERS[0]))
+IMGT_CDR1, IMGT_FW2, IMGT_CDR2, IMGT_FW3, IMGT_CDR3, IMGT_FW4 = (
+    list(range(start, end)) for start, end in zip(IMGT_BORDERS, IMGT_BORDERS[1:])
+)
+
+IMGT_CDR = IMGT_CDR1 + IMGT_CDR2 + IMGT_CDR3
+IMGT_FW = IMGT_FW1 + IMGT_FW2 + IMGT_FW3 + IMGT_FW4
+
+# Region name -> list of position numbers
+IMGT_CDR_DICT = {'cdr1': IMGT_CDR1, 'cdr2': IMGT_CDR2, 'cdr3': IMGT_CDR3}
+IMGT_FW_DICT = {'fw1': IMGT_FW1, 'fw2': IMGT_FW2, 'fw3': IMGT_FW3, 'fw4': IMGT_FW4}
+IMGT_REGION_DICT = dict(IMGT_CDR_DICT, **IMGT_FW_DICT)
+# Position number -> region name
+IMGT_POS_DICT = {
+    number: name
+    for name, numbers in IMGT_REGION_DICT.items()
+    for number in numbers
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from abnumber.__version__ import __version__
		from abnumber.chain import Chain