feat(IPVC-2471): add codon_table to txinfo file (#42)

biocommons · May 23, 2024 · e8c811c · e8c811c
1 parent 8b6e4d9
commit e8c811c
Show file tree

Hide file tree

Showing 8 changed files with 54 additions and 13 deletions.
diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
@@ -341,6 +341,7 @@ def main(ncbi_accession: str, output_dir: str) -> None:
                     mg.gene_symbol,
                     mg.cds_se_i(),
                     mg.exons_se_i(),
+                    mg.transl_table,
                     TxInfo.serialize_transl_except(mg.transl_except),
                 )
             )

diff --git a/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py b/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py
@@ -20,7 +20,10 @@
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.add_column('transcript', sa.Column('codon_table', sa.Integer(), server_default='1', nullable=False), schema='uta')
+    op.add_column('transcript', sa.Column('codon_table', sa.Text(), nullable=True), schema='uta')
+    # Manual backfill (not auto-generated): transcripts with a CDS get codon table '1'; rows without a CDS keep NULL.
+    # The table is schema-qualified to match add_column above, so the UPDATE does not depend on search_path.
+    op.execute("UPDATE uta.transcript SET codon_table = '1' WHERE cds_start_i IS NOT NULL;")
     # ### end Alembic commands ###
 
 

diff --git a/src/uta/formats/txinfo.py b/src/uta/formats/txinfo.py
@@ -8,7 +8,7 @@
 class TxInfo(
     recordtype.recordtype(
         'TxInfo',
-        ['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'transl_except'],
+        ['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'codon_table', 'transl_except'],
 )):
 
     @staticmethod

diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -707,8 +707,10 @@ def _fetch_origin_by_name(name):
 
         if ti.cds_se_i:
             cds_start_i, cds_end_i = map(int, ti.cds_se_i.split(","))
+            codon_table = ti.codon_table
         else:
             cds_start_i = cds_end_i = None
+            codon_table = None
             cds_md5 = None
 
         # 1. Fetch or make the Transcript record
@@ -776,6 +778,7 @@ def _fetch_origin_by_name(name):
                 cds_start_i=cds_start_i,
                 cds_end_i=cds_end_i,
                 cds_md5=cds_md5,
+                codon_table=codon_table,
             )
             session.add(u_tx)
 

diff --git a/src/uta/parsers/seqrecord.py b/src/uta/parsers/seqrecord.py
@@ -168,6 +168,15 @@ def exons_se_i(self):
             se_i = [(f.location.start.real, f.location.end.real) for f in exons]
         return se_i
 
+    @property
+    def codon_table(self) -> Optional[str]:
+        """Return the codon table id as a string, or None when there is no CDS feature."""
+        if self.cds_feature is None:
+            return None
+        # default codon table is the standard table, aka "1"
+        # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
+        return "1"
+
     @property
     def transl_except(self) -> Optional[List[str]]:
         if self.cds_feature is None:

diff --git a/tests/data/txinfo.gz b/tests/data/txinfo.gz
diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py
@@ -178,8 +178,19 @@ def test_load_txinfo(self):
             type='protein-coding',
             xrefs='MIM:610918,HGNC:HGNC:30397,Ensembl:ENSG00000198832,AllianceGenome:HGNC:30397',
         )
+        g2 = usam.Gene(
+            gene_id='4514',
+            hgnc='MT-CO3',
+            symbol='MT-CO3',
+            maploc=None,
+            descr='mitochondrially encoded cytochrome c oxidase III',
+            summary='mitochondrially encoded cytochrome c oxidase III',
+            aliases='COIII,MTCO3',
+            type='protein-coding',
+            xrefs='GeneID:4514,HGNC:HGNC:7422,MIM:516050',
+        )
         self.session.add(g1)
-
+        self.session.add(g2)
         self.session.commit()
 
         cf = configparser.ConfigParser()
@@ -207,6 +218,24 @@ def test_load_txinfo(self):
             },
         )
 
+        transcript = self.session.query(usam.Transcript).filter(usam.Transcript.ac == 'NC_012920.1_09206_09990').one()
+        self.assertEqual(
+            {
+                'ac': transcript.ac,
+                'gene_id': transcript.gene_id,
+                'cds_start_i': transcript.cds_start_i,
+                'cds_end_i': transcript.cds_end_i,
+                'codon_table': transcript.codon_table,
+            },
+            {
+                'ac': 'NC_012920.1_09206_09990',
+                'gene_id': '4514',
+                'cds_start_i': 0,
+                'cds_end_i': 784,
+                'codon_table': 2,
+            },
+        )
+
         exon_set = self.session.query(usam.ExonSet).filter(usam.ExonSet.tx_ac == 'NM_080430.4').one()
         exons = self.session.query(usam.Exon).filter(usam.Exon.exon_set_id == exon_set.exon_set_id).all()
         self.assertEqual(len(exons), 5)

diff --git a/tests/test_uta_parsers_seqrecord.py b/tests/test_uta_parsers_seqrecord.py
@@ -27,6 +27,7 @@ class TestSeqRecordFacade(unittest.TestCase):
             expected_cds_translation="MAESGRPWAQARSAYRASEVLRRGTGRRRDPGPQSNGPGQEDARAPGRMARLRGQLRAEAASRSEVPRLLKLVERAGAG" \
                 "AAGAGERTGAHSRGSVCSVCGEPRGGATYPAGVLEVSERRLQEGLAAVREELGAGIEALRAELRAELDALRALLPPPPSPPARREPRAVPRAAPRGPTLP" \
                 "RTLGTVSALVAASRPADDAPDGPAECGAHRAPARKNHKKMPVPPGAPQGGGD",
+            expected_codon_table="1",
             expected_exons_se_i=[(0, 696)],
         ),
         param(
@@ -49,6 +50,7 @@ class TestSeqRecordFacade(unittest.TestCase):
                 "EDIDECALPTGGHICSYRCINIPGSFQCSCPSSGYRLAPNGRNCQDIDECVTGIHNCSINETCFNIQGGFRCLAFECPENYRRSAATRCERLPCHENREC" \
                 "SKLPLRITYYHLSFPTNIQAPAVVFRMGPSSAVPGDSMQLAITGGNEEGFFTTRKVSPHSGVVALTKPVPEPRDLLLTVKMDLSRHGTVSSFVAKLFIFV" \
                 "SAEL",
+            expected_codon_table="1",
             expected_exons_se_i=[
                 (0, 182),
                 (182, 288),
@@ -80,6 +82,7 @@ class TestSeqRecordFacade(unittest.TestCase):
             expected_cds_product=None,
             expected_cds_protein_id=None,
             expected_cds_translation=None,
+            expected_codon_table=None,
             expected_exons_se_i=[],
         ),
         param(
@@ -95,6 +98,7 @@ class TestSeqRecordFacade(unittest.TestCase):
             expected_cds_product=None,
             expected_cds_protein_id=None,
             expected_cds_translation=None,
+            expected_codon_table=None,
             expected_exons_se_i=[],
         ),
     ])
@@ -112,6 +116,7 @@ def test_seq_record_facade(
         expected_cds_product,
         expected_cds_protein_id,
         expected_cds_translation,
+        expected_codon_table,
         expected_exons_se_i,
     ):
         gbff_file = os.path.join(self.test_data_dir, file_name)
@@ -127,18 +132,9 @@ def test_seq_record_facade(
         assert self.seq_record_facade.cds_product == expected_cds_product
         assert self.seq_record_facade.cds_protein_id == expected_cds_protein_id
         assert self.seq_record_facade.cds_translation == expected_cds_translation
+        assert self.seq_record_facade.codon_table == expected_codon_table
         assert self.seq_record_facade.exons_se_i == expected_exons_se_i
 
-    # @parameterized.expand([    @parameterized.expand([
-    #     param('no genes', features={}),
-    #     param('no genes', features={'gene': []}),
-    #     param('more than one gene', features={'gene': [Mock(), Mock()]}),
-    #     param('more than one CDS', features={'CDS': [Mock(), Mock()]}),
-    # ])
-    # def test_validate_features_by_type_invalid(self, test_name, features):
-    #     with self.assertRaises(SeqRecordFeatureError):
-    #         SeqRecordFacade.validate_features_by_type(features)
-
     @parameterized.expand([
         param("no gene feature", gene_feature_mock={}),
         param("gene feature is None", gene_feature_mock={"gene": None}),