Skip to content

Commit

Permalink
feat(IPVC-2471): add codon_table to txinfo file (#42)
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 authored May 23, 2024
1 parent 8b6e4d9 commit e8c811c
Show file tree
Hide file tree
Showing 8 changed files with 54 additions and 13 deletions.
1 change: 1 addition & 0 deletions sbin/ncbi_process_mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ def main(ncbi_accession: str, output_dir: str) -> None:
mg.gene_symbol,
mg.cds_se_i(),
mg.exons_se_i(),
mg.transl_table,
TxInfo.serialize_transl_except(mg.transl_except),
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@

# Alembic upgrade step: adds a `codon_table` column to the `transcript` table
# in schema `uta`, then backfills it for rows that have a CDS start.
def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
# NOTE(review): the next two statements both add a column named `codon_table`
# to the same table. This text looks like a rendered diff, so the Integer
# variant is presumably the pre-change line and the Text variant its
# replacement -- confirm against the actual migration file before running.
op.add_column('transcript', sa.Column('codon_table', sa.Integer(), server_default='1', nullable=False), schema='uta')
op.add_column('transcript', sa.Column('codon_table', sa.Text(), nullable=True), schema='uta')
# ### end Alembic commands ###
# ### population of codon_table column with data ###
# Sets codon_table to '1' on every row whose cds_start_i is not null
# (`NOTNULL` is PostgreSQL's non-standard spelling of `IS NOT NULL`).
# NOTE(review): `transcript` is not schema-qualified here, unlike the
# add_column calls above (schema='uta') -- presumably the search_path
# resolves it; confirm.
op.execute("UPDATE transcript SET codon_table = '1' WHERE cds_start_i NOTNULL;")
# ### end population of codon_table column ###


Expand Down
2 changes: 1 addition & 1 deletion src/uta/formats/txinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class TxInfo(
recordtype.recordtype(
'TxInfo',
['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'transl_except'],
['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'codon_table', 'transl_except'],
)):

@staticmethod
Expand Down
3 changes: 3 additions & 0 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,8 +707,10 @@ def _fetch_origin_by_name(name):

if ti.cds_se_i:
cds_start_i, cds_end_i = map(int, ti.cds_se_i.split(","))
codon_table = ti.codon_table
else:
cds_start_i = cds_end_i = None
codon_table = None
cds_md5 = None

# 1. Fetch or make the Transcript record
Expand Down Expand Up @@ -776,6 +778,7 @@ def _fetch_origin_by_name(name):
cds_start_i=cds_start_i,
cds_end_i=cds_end_i,
cds_md5=cds_md5,
codon_table=codon_table,
)
session.add(u_tx)

Expand Down
9 changes: 9 additions & 0 deletions src/uta/parsers/seqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ def exons_se_i(self):
se_i = [(f.location.start.real, f.location.end.real) for f in exons]
return se_i

@property
def codon_table(self) -> Optional[str]:
    """Return the codon table identifier for this record's CDS.

    Returns:
        ``None`` when the record has no CDS feature; otherwise ``"1"``.
    """
    # Every record with a CDS is reported with the standard table, aka "1":
    # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    has_cds = self.cds_feature is not None
    return "1" if has_cds else None

@property
def transl_except(self) -> Optional[List[str]]:
if self.cds_feature is None:
Expand Down
Binary file modified tests/data/txinfo.gz
Binary file not shown.
31 changes: 30 additions & 1 deletion tests/test_uta_loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,19 @@ def test_load_txinfo(self):
type='protein-coding',
xrefs='MIM:610918,HGNC:HGNC:30397,Ensembl:ENSG00000198832,AllianceGenome:HGNC:30397',
)
g2 = usam.Gene(
gene_id='4514',
hgnc='MT-CO3',
symbol='MT-CO3',
maploc=None,
descr='mitochondrially encoded cytochrome c oxidase III',
summary='mitochondrially encoded cytochrome c oxidase III',
aliases='COIII,MTCO3',
type='protein-coding',
xrefs='GeneID:4514,HGNC:HGNC:7422,MIM:516050',
)
self.session.add(g1)

self.session.add(g2)
self.session.commit()

cf = configparser.ConfigParser()
Expand Down Expand Up @@ -207,6 +218,24 @@ def test_load_txinfo(self):
},
)

transcript = self.session.query(usam.Transcript).filter(usam.Transcript.ac == 'NC_012920.1_09206_09990').one()
self.assertEqual(
{
'ac': transcript.ac,
'gene_id': transcript.gene_id,
'cds_start_i': transcript.cds_start_i,
'cds_end_i': transcript.cds_end_i,
'codon_table': transcript.codon_table,
},
{
'ac': 'NC_012920.1_09206_09990',
'gene_id': '4514',
'cds_start_i': 0,
'cds_end_i': 784,
'codon_table': 2,
},
)

exon_set = self.session.query(usam.ExonSet).filter(usam.ExonSet.tx_ac == 'NM_080430.4').one()
exons = self.session.query(usam.Exon).filter(usam.Exon.exon_set_id == exon_set.exon_set_id).all()
self.assertEqual(len(exons), 5)
Expand Down
16 changes: 6 additions & 10 deletions tests/test_uta_parsers_seqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class TestSeqRecordFacade(unittest.TestCase):
expected_cds_translation="MAESGRPWAQARSAYRASEVLRRGTGRRRDPGPQSNGPGQEDARAPGRMARLRGQLRAEAASRSEVPRLLKLVERAGAG" \
"AAGAGERTGAHSRGSVCSVCGEPRGGATYPAGVLEVSERRLQEGLAAVREELGAGIEALRAELRAELDALRALLPPPPSPPARREPRAVPRAAPRGPTLP" \
"RTLGTVSALVAASRPADDAPDGPAECGAHRAPARKNHKKMPVPPGAPQGGGD",
expected_codon_table="1",
expected_exons_se_i=[(0, 696)],
),
param(
Expand All @@ -49,6 +50,7 @@ class TestSeqRecordFacade(unittest.TestCase):
"EDIDECALPTGGHICSYRCINIPGSFQCSCPSSGYRLAPNGRNCQDIDECVTGIHNCSINETCFNIQGGFRCLAFECPENYRRSAATRCERLPCHENREC" \
"SKLPLRITYYHLSFPTNIQAPAVVFRMGPSSAVPGDSMQLAITGGNEEGFFTTRKVSPHSGVVALTKPVPEPRDLLLTVKMDLSRHGTVSSFVAKLFIFV" \
"SAEL",
expected_codon_table="1",
expected_exons_se_i=[
(0, 182),
(182, 288),
Expand Down Expand Up @@ -80,6 +82,7 @@ class TestSeqRecordFacade(unittest.TestCase):
expected_cds_product=None,
expected_cds_protein_id=None,
expected_cds_translation=None,
expected_codon_table=None,
expected_exons_se_i=[],
),
param(
Expand All @@ -95,6 +98,7 @@ class TestSeqRecordFacade(unittest.TestCase):
expected_cds_product=None,
expected_cds_protein_id=None,
expected_cds_translation=None,
expected_codon_table=None,
expected_exons_se_i=[],
),
])
Expand All @@ -112,6 +116,7 @@ def test_seq_record_facade(
expected_cds_product,
expected_cds_protein_id,
expected_cds_translation,
expected_codon_table,
expected_exons_se_i,
):
gbff_file = os.path.join(self.test_data_dir, file_name)
Expand All @@ -127,18 +132,9 @@ def test_seq_record_facade(
assert self.seq_record_facade.cds_product == expected_cds_product
assert self.seq_record_facade.cds_protein_id == expected_cds_protein_id
assert self.seq_record_facade.cds_translation == expected_cds_translation
assert self.seq_record_facade.codon_table == expected_codon_table
assert self.seq_record_facade.exons_se_i == expected_exons_se_i

# @parameterized.expand([ @parameterized.expand([
# param('no genes', features={}),
# param('no genes', features={'gene': []}),
# param('more than one gene', features={'gene': [Mock(), Mock()]}),
# param('more than one CDS', features={'CDS': [Mock(), Mock()]}),
# ])
# def test_validate_features_by_type_invalid(self, test_name, features):
# with self.assertRaises(SeqRecordFeatureError):
# SeqRecordFacade.validate_features_by_type(features)

@parameterized.expand([
param("no gene feature", gene_feature_mock={}),
param("gene feature is None", gene_feature_mock={"gene": None}),
Expand Down

0 comments on commit e8c811c

Please sign in to comment.