-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #70 from VDBWRAIR/dev
Next release
- Loading branch information
Showing
71 changed files
with
2,807 additions
and
126 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,7 +57,6 @@ report.html | |
|
||
# Sphinx documentation | ||
docs/_build/ | ||
docs/_static/ | ||
docs/_templates/ | ||
|
||
# PyBuilder | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,12 +10,7 @@ install: | |
- pip install -r tests/requirements.txt | ||
- python setup.py develop | ||
script: | ||
- nosetests tests --with-coverage --cover-erase --cover-package=bio_pieces | ||
- nosetests tests --with-coverage --cover-erase --cover-package=bio_bits -a '!download' | ||
- pybot tests/*.robot | ||
after_success: | ||
- coveralls | ||
|
||
notifications: | ||
email: | ||
recipients: | ||
- [email protected] | ||
- coveralls |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
bio_bits is a collection of bioinformatics scripts and libraries written in Python | ||
Copyright (C) 2015 Cherokee Nation Technology Solutions, LLC | ||
|
||
|
||
This program is free software; you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation; either version 2 of the License, or | ||
(at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License along | ||
with this program; if not, write to the Free Software Foundation, Inc., | ||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
__version__ = '1.2.0' | ||
__release__ = __version__ | ||
__authors__ = 'Tyghe Vallard, Michael Panciera' | ||
__authoremails__ = '[email protected], [email protected]' | ||
__description__ = 'bio_bits is a collection of bioinformatics scripts and libraries written in Python. The goal of the project is to make common bioinformatics tasks easier to perform and to provide useful libraries to for inclusion in more complex bioinformatics libraries and scripts.' | ||
__projectname__ = 'bio_bits' |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
try: | ||
from StringIO import StringIO | ||
except ImportError: | ||
from io import StringIO | ||
|
||
try: | ||
from BytesIO import BytesIO | ||
except ImportError: | ||
from io import BytesIO | ||
|
||
from future.builtins import map, filter, zip | ||
|
||
try: | ||
import unittest2 as unittest | ||
except ImportError: | ||
import unittest | ||
|
||
|
||
try:
    # Python 3 only offers reduce through functools; Python 2.6+ has it
    # there as well, so this import normally succeeds everywhere.
    from functools import reduce
except ImportError:
    # Only a missing functools.reduce is an expected failure; in that case
    # the builtin reduce stays in effect. Anything else should propagate
    # instead of being silently swallowed.
    pass
|
||
try: | ||
from collections import OrderedDict | ||
except ImportError: | ||
from ordereddict import OrderedDict | ||
|
||
try: | ||
from __builtin__ import open | ||
except ImportError: | ||
from builtins import open | ||
|
||
# Tests directory | ||
from os.path import dirname | ||
THIS = dirname(__file__) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,240 @@ | ||
#!/usr/bin/env python | ||
# encoding: utf-8 | ||
""" | ||
ctleptop.py -i [FASTA FILE] > Out_file.txt | ||
Created by Dereje Jima on May 21, 2015 | ||
""" | ||
from __future__ import division | ||
from __future__ import print_function | ||
from Bio.Seq import * | ||
from Bio.Alphabet import IUPAC | ||
from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna | ||
#from itertools import groupby | ||
from Bio.Data import CodonTable | ||
from Bio.Data.IUPACData import ambiguous_dna_values | ||
#import yaml | ||
import argparse | ||
from bio_bits import degen | ||
from functools import partial | ||
from tabulate import tabulate | ||
from bio_bits.compat import zip | ||
import re | ||
import sys | ||
|
||
__docformat__ = "restructuredtext en" | ||
|
||
# Lookup table from an ambiguous nucleotide code to the list of concrete
# bases it can stand for. The unambiguous bases A, C, G and T are not keys,
# so a failed lookup means "this character is already unambiguous".
AMBICODON = {"R": ["A", "G"], "Y": ["C", "T"],
             "W": ["A", "T"], "S": ["G", "C"],
             "K": ["T", "G"],
             "M": ["C", "A"], "D": ["A", "T", "G"],
             "V": ["A", "C", "G"], "H": ["A", "C", "T"],
             "B": ["C", "G", "T"], "N": ["A", "C", "T", "G"]}
|
||
def getNearbyChars(nt):
    """(str)->(list or str)
    Expand a nucleotide character into the bases it may represent.

    Characters that are keys of AMBICODON give back their list of bases;
    every other character is returned unchanged, i.e. as the same string
    that was passed in (iterating it still yields that single character).

    >>> getNearbyChars("R")
    ['A', 'G']
    >>> getNearbyChars("Y")
    ['C', 'T']
    >>> getNearbyChars("A")
    'A'
    """
    expansions = AMBICODON.get(nt)
    if expansions:
        return expansions
    return nt
|
||
def nearbyPermutations(letters, index=0):
    """(str)->(set)
    Return every concrete word obtainable from ``letters[index:]`` by
    replacing each ambiguity code with one of the bases it stands for.

    >>> sorted(nearbyPermutations("AAR"))
    ['AAA', 'AAG']
    >>> sorted(nearbyPermutations("ARR"))
    ['AAA', 'AAG', 'AGA', 'AGG']
    >>> sorted(nearbyPermutations("AAA"))
    ['AAA']
    """
    # Start from the empty suffix and grow the words leftwards, one
    # position at a time, which visits positions in the same order as
    # unwinding a recursion over index + 1.
    words = set([''])
    for position in reversed(range(index, len(letters))):
        choices = getNearbyChars(letters[position])
        words = permutations(words, choices)
    return words
|
||
def permutations(subWords, nearbyLetters):
    """(set, list) -> (set)
    Prefix every word in ``subWords`` with every letter in
    ``nearbyLetters`` and return the resulting set of words.

    >>> sorted(permutations(set(['CA']), ['A', 'T']))
    ['ACA', 'TCA']
    """
    return set(
        prefix + suffix
        for suffix in subWords
        for prefix in nearbyLetters
    )
|
||
def getaalist(codonlist):
    """(list) -> (list)
    Translate each nucleotide codon in ``codonlist`` and return the
    translations as strings, in the same order.

    >>> getaalist(['AAA', 'ACT'])
    ['K', 'T']
    """
    return [
        str(translate(Seq(codon, IUPAC.unambiguous_dna)))
        for codon in codonlist
    ]
|
||
def list_overlap(list1, list2):
    """(str, list) -> bool
    Return True when at least one element of ``list1`` is also contained
    in ``list2``, otherwise False.

    >>> list_overlap('RAC', ['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
    True
    >>> list_overlap('ACT', ['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
    False
    """
    return any(element in list2 for element in list1)
|
||
def access_mixed_aa(file_name):
    """(str) ->(list,list,list,list).
    Walk every record of a multi-FASTA nucleotide file codon by codon and
    collect amino acid translations plus details of the ambiguous codons.

    :param file_name: FASTA input handed to ``SeqIO.parse``
    :return: tuple ``(aa, nucleotide_idx, nucl_codon, seqids)``. ``aa``
        holds the translations recorded for the codons; ambiguous codons
        contribute their alternative amino acids joined with '/'. The
        other three lists are parallel and get one entry each time an
        ambiguous codon yields more than one amino acid: the position,
        the codon string and the id of the sequence it came from.
    """
    # NOTE(review): the statement nesting in this function was reconstructed
    # from a rendering without indentation; in particular confirm whether
    # the statements following the '/'.join belong inside the inner loop.
    from Bio import SeqIO
    aa = []
    nucleotide_idx = []
    nucl_codon = []
    seqids = []
    for seq_record in SeqIO.parse(file_name, 'fasta'):
        seq_id = seq_record.id
        seqline = str(seq_record.seq)
        # Gap characters are handled as the fully ambiguous base N
        seqline = seqline.replace("-", "N")
        n = 3
        # Keyed by i + n, which is the 1-based position of the codon's
        # last base; the final slice may be shorter than n characters.
        codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
        ambi_nucl = AMBICODON.keys()
        for key, codon in sorted(codon_list.items()):
            if list_overlap(codon, ambi_nucl):
                # NOTE(review): this unpacking raises ValueError when a
                # trailing codon shorter than three characters contains
                # an ambiguity code.
                d, e, f = codon
                m = [d, e, f]
                items = [i for i in m if i in ambi_nucl]
                # Offset (0-2) of the first ambiguous base inside the codon
                indexm = m.index(items[0])
                for idx, val in enumerate(items):
                    codonlist = list(nearbyPermutations(codon))
                    val = getaalist(codonlist)
                    # Collapse duplicate translations, e.g. ['D', 'D']
                    val = set(val)
                    val = "/".join(sorted(val))  # yields e.g. 'I/L'

                    # key - 2 is the 1-based position of the codon's first
                    # base, so this points at the first ambiguous base.
                    # NOTE(review): key is reassigned here, so a further
                    # loop iteration would shift it again - confirm intent.
                    key = key - 2 + indexm
                    # Only codons with more than one possible amino acid
                    # are reported in the three parallel lists.
                    if '/' in val:
                        nucleotide_idx.append(key)
                        nucl_codon.append(codon)
                        seqids.append(seq_id)
                    # The single position formula above supersedes an
                    # earlier per-indexm if/elif chain.
                    aa.append(val)

            else:
                # No ambiguity code present: translate the codon directly
                aa1 = Seq(codon, IUPAC.unambiguous_dna)
                aa1 = aa1.translate()
                aa1 = str(aa1)
                aa.append(aa1)
    return aa, nucleotide_idx, nucl_codon, seqids
|
||
|
||
def create_args(argv=None):
    """
    Build the command line parser and return the parsed arguments.

    :param argv: optional list of argument strings to parse. When left as
        None, argparse reads ``sys.argv[1:]``, so existing callers that
        pass nothing behave as before.
    :return: ``argparse.Namespace`` with the attributes ``i``, ``o``,
        ``gb_file``, ``gb_id``, ``tab_file`` and ``cds``
    """
    parser = argparse.ArgumentParser(
        # Implicit concatenation keeps source indentation out of the text,
        # unlike backslash continuations inside the string literal.
        description=(
            'Convert inframe nucleotide '
            'fasta file to protein and report mixed '
            '(ambiguous codon) with its location in '
            'the sequence'
        ),
        epilog='%(prog)s -i tests/Den4_MAAPS_TestData16.fasta -o out_file.txt'
    )
    # Exactly one source of gene information has to be supplied
    gene_source = parser.add_mutually_exclusive_group(required=True)
    parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
    parser.add_argument("-o", type=str, help="output file name", required=True)
    gene_source.add_argument("--gb-file", type=str, help="genbank file name")
    gene_source.add_argument("--gb-id", type=str, help="genbank accession id")
    gene_source.add_argument("--tab-file", type=str, help="gene tab/csv file")
    parser.add_argument('--cds', type=str, help="CDS start stop[start,stop]")
    return parser.parse_args(argv)
|
||
def mod_entry(entry, cds):
    '''
    Flag a mixed-position entry that is non-coding or contains a gap.

    :param entry: sequence of (seqid,nucindex,aaindex,nuclcodon,aacodon,genename)
    :param cds: object exposing ``start`` and ``end`` of the coding region
    :return: tuple copy of entry whose aacodon field is 'NON-CODING' when
        nucindex is not strictly between cds.start and cds.end, otherwise
        'GAPFOUND' when nuclcodon contains 'N', otherwise unchanged
    '''
    fields = list(entry)
    position, codon = entry[1], entry[3]
    outside_cds = cds.start >= position or cds.end <= position
    if outside_cds:
        # The non-coding check wins over the gap check
        fields[4] = 'NON-CODING'
    elif 'N' in codon:
        fields[4] = 'GAPFOUND'
    return tuple(fields)
|
||
def main():
    """
    Command line entry point: parse the arguments, locate mixed
    (ambiguous) codons in the input FASTA file and write them as a table
    to the output file. Exits with status 1 when no CDS information is
    available from either the gene source or the --cds option.
    """
    args = create_args()

    with open(args.o, 'w+') as outf:
        aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(args.i)

        # Gene information for the reported nucleotide positions
        reference_genes, cds = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
        overlapped_genes = degen.get_degen_list_overlap(reference_genes, nuc_idx)

        # Keep only the mixed amino acid entries
        amb_aa_codon = [residue for residue in aa if '/' in residue]
        # Amino acid index for every reported nucleotide position
        amb_aa_indx = [position // 3 + 1 for position in nuc_idx]

        # A CDS given on the command line overrides the one from the gene source
        if args.cds:
            cds_start, cds_end = map(int, args.cds.split(','))
            cds = degen.Gene('CDS', cds_start, cds_end)

        if cds is None:
            print("No CDS information supplied via input file or on command line")
            sys.exit(1)

        # Assemble the rows and mark gaps and non-coding positions
        rows = [
            mod_entry(row, cds)
            for row in zip(seqids, nuc_idx, amb_aa_indx, nucl_codon,
                           amb_aa_codon, overlapped_genes)
        ]
        column_names = [
            'seq id', 'nt Position', 'aa position',
            'nt composition', 'aa composition', 'gene name'
        ]
        outf.write(tabulate(rows, headers=column_names) + "\n")
|
||
if __name__ == '__main__': | ||
main() |
Oops, something went wrong.