Merge pull request #70 from VDBWRAIR/dev

Next release
VDBWRAIR · Jan 4, 2016 · 5316608 · 5316608
2 parents 2a110c2 + b61a578
commit 5316608
Show file tree

Hide file tree

Showing 71 changed files with 2,807 additions and 126 deletions.
diff --git a/.gitignore b/.gitignore
@@ -57,7 +57,6 @@ report.html
 
 # Sphinx documentation
 docs/_build/
-docs/_static/
 docs/_templates/
 
 # PyBuilder

diff --git a/.travis.yml b/.travis.yml
@@ -10,12 +10,7 @@ install:
     - pip install -r tests/requirements.txt
     - python setup.py develop
 script:
-    - nosetests tests --with-coverage --cover-erase --cover-package=bio_pieces
+    - nosetests tests --with-coverage --cover-erase --cover-package=bio_bits -a '!download'
     - pybot tests/*.robot
 after_success:
-    - coveralls
-
-notifications:
-   email:
-      recipients:
-         - [email protected]
+    - coveralls 
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,15 @@
 CHANGELOG
 =========
 
+Version 1.2.0
+-------------
+
+* Renamed project to bio_bits to fix naming issue with other project
+* GPL License added
+* degen_regions script added
+* parallel_blast added
+* plot_muts script added
+
 Version 1.1.0
 -------------
 

diff --git a/COPYRIGHT b/COPYRIGHT
@@ -0,0 +1,17 @@
+bio_bits is a collection of bioinformatics scripts and libraries written in Python
+Copyright (C) 2015 Cherokee Nation Technology Solutions, LLC
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
diff --git a/Den4_MAAPS_TestData16.fasta b/Den4_MAAPS_TestData16.fasta
diff --git a/README.rst b/README.rst
@@ -1,15 +1,15 @@
-bio_pieces
+bio_bits
 ==========
 
 .. image:: https://readthedocs.org/projects/bio-pieces/badge/?version=latest
     :target: http://bio-pieces.readthedocs.org/en/latest/
     :alt: Documentation Status
 
-.. image:: https://travis-ci.org/VDBWRAIR/bio_pieces.svg
-    :target: https://travis-ci.org/VDBWRAIR/bio_pieces
+.. image:: https://travis-ci.org/VDBWRAIR/bio_bits.svg
+    :target: https://travis-ci.org/VDBWRAIR/bio_bits
 
-.. image:: https://coveralls.io/repos/VDBWRAIR/bio_pieces/badge.svg
-  :target: https://coveralls.io/r/VDBWRAIR/bio_pieces
+.. image:: https://coveralls.io/repos/VDBWRAIR/bio_bits/badge.svg
+  :target: https://coveralls.io/r/VDBWRAIR/bio_bits
 
 Various bioinformatics scripts
 

diff --git a/bio_bits/__init__.py b/bio_bits/__init__.py
@@ -0,0 +1,6 @@
+__version__ = '1.2.0'
+__release__ = __version__
+__authors__ = 'Tyghe Vallard, Michael Panciera'
+__authoremails__ = '[email protected], [email protected]'
+__description__ = 'bio_bits is a collection of bioinformatics scripts and libraries written in Python. The goal of the project is to make common bioinformatics tasks easier to perform and to provide useful libraries to for inclusion in more complex bioinformatics libraries and scripts.'
+__projectname__ = 'bio_bits'
diff --git a/bio_pieces/amos.py → bio_bits/amos.py b/bio_pieces/amos.py → bio_bits/amos.py
diff --git a/bio_pieces/amos2fastq.py → bio_bits/amos2fastq.py b/bio_pieces/amos2fastq.py → bio_bits/amos2fastq.py
@@ -11,7 +11,7 @@
 from Bio import SeqIO
 import itertools
 import pandas as pd
-from bio_pieces import amos
+from bio_bits import amos
 ''' Python3 compatibility '''
 from past.builtins import map , filter
 

diff --git a/bio_pieces/amos2fastq_main.py → bio_bits/amos2fastq_main.py b/bio_pieces/amos2fastq_main.py → bio_bits/amos2fastq_main.py
@@ -6,7 +6,7 @@
 '''
 from schema import Schema, Use, And
 from docopt import docopt
-from bio_pieces import amos2fastq
+from bio_bits import amos2fastq
 #Do file validation immediately when script is started
 def all_elemnts_unique(collection):
     return len(collection) == len(set(collection))

diff --git a/bio_pieces/beast_checkpoint.py → bio_bits/beast_checkpoint.py b/bio_pieces/beast_checkpoint.py → bio_bits/beast_checkpoint.py
diff --git a/bio_pieces/beast_wrapper.py → bio_bits/beast_wrapper.py b/bio_pieces/beast_wrapper.py → bio_bits/beast_wrapper.py
diff --git a/bio_bits/compat.py b/bio_bits/compat.py
@@ -0,0 +1,36 @@
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
+
+try:
+    from BytesIO import BytesIO
+except ImportError:
+    from io import BytesIO
+
+from future.builtins import map, filter, zip
+
+try:
+    import unittest2 as unittest
+except ImportError:
+    import unittest
+
+
+try:
+    from functools import reduce
+except:
+    pass
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
+try:
+    from __builtin__ import open
+except ImportError:
+    from builtins import open
+
+# Tests directory
+from os.path import dirname
+THIS = dirname(__file__)
diff --git a/bio_bits/ctleptop.py b/bio_bits/ctleptop.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+ctleptop.py -i [FASTA FILE] > Out_file.txt
+
+Created by Dereje Jima on May 21, 2015
+"""
+from __future__ import division
+from __future__ import print_function
+from Bio.Seq import *
+from Bio.Alphabet import IUPAC
+from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
+#from itertools import groupby
+from Bio.Data import CodonTable
+from Bio.Data.IUPACData import ambiguous_dna_values
+#import yaml
+import argparse
+from bio_bits import degen
+from functools import partial
+from tabulate import tabulate
+from bio_bits.compat import zip
+import re
+import sys
+
+__docformat__ = "restructuredtext en"
+
+# Maps each single-letter nucleotide ambiguity code to the list of concrete
+# bases (A, C, G, T) it can stand for. Plain A/C/G/T are intentionally not
+# keys, so membership in this dict is what marks a base as ambiguous.
+AMBICODON = {"R": ["A", "G"], "Y": ["C", "T"],
+             "W": ["A", "T"], "S": ["G", "C"],
+             "K": ["T", "G"],
+             "M": ["C", "A"], "D": ["A", "T", "G"],
+             "V": ["A", "C", "G"], "H": ["A", "C", "T"],
+             "B": ["C", "G", "T"], "N": ["A", "C", "T", "G"]}
+
+def getNearbyChars(nt):
+    """(str)->(list)
+    >>>getNearbyChars("R")
+    ['A', 'G']
+    >>>getNearbyChars("Y")
+    ['C', 'T']
+    >>>getNearbyChars("A")
+    ['A']
+    """
+    return AMBICODON.get(nt) or nt
+
+def nearbyPermutations(letters, index=0):
+    """(str)->(set)
+    >>>nearbyPermutations("AAR")
+    set(['AAG', 'AAA'])
+    >>>nearbyPermutations("ARR")
+    set(['AGG', 'AAG', 'AAA', 'AGA'])
+    nearbyPermutations("AAA")
+    set(['AAA'])
+    """
+    if (index >= len(letters)):
+        return set([''])
+    subWords = nearbyPermutations(letters, index + 1)
+    nearbyLetters = getNearbyChars(letters[index])
+    return permutations(subWords, nearbyLetters)
+
+def permutations(subWords, nearbyLetters):
+    """(set, list) -> (set)
+    >>>permutations(set(['CA']), ['A', 'T'])
+    set(['ACA', 'TCA'])
+    """
+    permutations = set()
+    for subWord in subWords:
+        for letter in nearbyLetters:
+            permutations.add(letter + subWord)
+    return permutations
+
+def getaalist(codonlist):
+    """(list) -> (list)
+    Return aa list from a a given nt codon list.
+    >>>getaalist(['AAA','ACT'])
+    ['K', 'T']
+    """
+    aalist = []
+    for codon in codonlist:
+        aa = Seq(codon, IUPAC.unambiguous_dna)
+        aa = str(translate(aa))
+        aalist.append(aa)
+    return aalist
+
+def list_overlap(list1, list2):
+    """(str, list) -> bool
+    Return True  if the two list hava element that overlaps.
+
+    >>>list_overlap('RAC',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
+    True
+    >>>list_overlap('ACT',['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
+    False
+
+    """
+    for i in list1:
+        if i in list2:
+            return True
+    return False
+
+def access_mixed_aa(file_name):
+    """(str) ->(list,list,list,list).
+    Return a list of amino acide code for ambiguous dna codon, position of
+    ambiguous nt codon, aa name,seq id from fasta header  by reading multifasta
+    nucleotide fasta file
+    """
+    from Bio import SeqIO
+    aa = []
+    nucleotide_idx = []
+    nucl_codon = []
+    seqids = []
+    for seq_record in SeqIO.parse(file_name, 'fasta'):
+        seq_id = seq_record.id
+        seqline = str(seq_record.seq)
+        seqline = seqline.replace("-", "N")
+        n = 3
+        codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
+        ambi_nucl = AMBICODON.keys()
+        for key, codon in sorted(codon_list.items()):
+            if list_overlap(codon, ambi_nucl):
+                d, e, f = codon
+                m = [d, e, f]
+                items = [i for i in m if i in ambi_nucl]
+                indexm = m.index(items[0])
+                for idx, val in enumerate(items):
+                    codonlist = list(nearbyPermutations(codon))
+                    val = getaalist(codonlist)
+                    # remove if aa codon is the same eg. ['D', 'D']
+                    val = set(val)
+                    val = "/".join(sorted(val))   # yeild 'I/L'
+
+                    key = key - 2 + indexm
+                    if '/' in val:
+                        nucleotide_idx.append(key)
+                        nucl_codon.append(codon)
+                        seqids.append(seq_id)
+#                    if "/" in val and indexm == 2:
+#                        key = key
+#                        nucleotide_idx.append(key)
+#                        nucl_codon.append(codon)
+#                        seqids.append(seq_id)
+#                    elif "/" in val and indexm == 1:
+#                        key = key - 1
+#                        nucleotide_idx.append(key)
+#                        nucl_codon.append(codon)
+#                        seqids.append(seq_id)
+#                    elif "/" in val and indexm == 0:
+#                        key = key - 2
+#                        nucleotide_idx.append(key)
+#                        nucl_codon.append(codon)
+#                        seqids.append(seq_id)
+#                    else:
+#                        pass
+                    aa.append(val)
+
+            else:
+                # print "codon3 ..." ,codon
+                aa1 = Seq(codon, IUPAC.unambiguous_dna)
+                aa1 = aa1.translate()
+                aa1 = str(aa1)
+                aa.append(aa1)
+    #print aa, nucleotide_idx, nucl_codon, seqids
+    return aa, nucleotide_idx, nucl_codon, seqids
+
+
+def create_args():
+    """
+    Return command line arguments
+
+    """
+    parser = argparse.ArgumentParser(
+        description='Convert inframe nucleotide \
+             fasta file to protein and report mixed \
+             (ambiguous codon) with its location in \
+             the sequence',
+        epilog = '%(prog)s -i tests/Den4_MAAPS_TestData16.fasta -o out_file.txt'
+    )
+    g = parser.add_mutually_exclusive_group(required=True)
+    parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
+    parser.add_argument("-o", type=str,  help="output file name", required=True)
+    g.add_argument("--gb-file", type=str,  help="genbank file name")
+    g.add_argument("--gb-id", type=str,  help="genabnk accession id")
+    g.add_argument("--tab-file", type=str,  help="gene tab/csv file")
+    parser.add_argument('--cds', type=str, help="CDS start stop[start,stop]")
+    return parser.parse_args()
+
+def mod_entry(entry, cds):
+    '''
+    Find Gap positions and non-coding region positions
+    :param entry: iterable of (seqid,nucindex,aaindex,nuclcodon,aacodon,genename)
+    :cds: Gene of CDS info
+    :return: entry modified to reflect gap or non-coding
+    '''
+    new_entry = list(entry)
+    nuc_pos = entry[1]
+    nt = entry[3]
+    if cds.start >= nuc_pos or cds.end <= nuc_pos:
+        new_entry[4] = 'NON-CODING'
+    elif 'N' in nt:
+        new_entry[4] = 'GAPFOUND'
+    return tuple(new_entry)
+
+def main():
+    args = create_args()
+    file_name = args.i
+    outfile = args.o
+
+    with open(outfile, 'w+') as outf:
+        aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(file_name)
+
+        # Get Gene info
+        reference_genes, cds = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
+        overlapped_genes = degen.get_degen_list_overlap(reference_genes, nuc_idx)
+
+        # Remove all non-mixed positions
+        amb_aa_codon = filter(lambda x: '/' in x, aa)
+        # get amino acid index list
+        amb_aa_indx = map(lambda x: x//3 + 1, nuc_idx)
+
+        mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
+        if args.cds:
+            cds_start, cds_end = map(int, args.cds.split(','))
+            cds = degen.Gene('CDS', cds_start, cds_end)
+
+        if cds is None:
+            print("No CDS information supplied via input file or on command line")
+            sys.exit(1)
+
+        # mark gaps and non-coding positions
+        mixed_positions= map(lambda x: mod_entry(x, cds), mixed_positions)
+        outf.write(
+            tabulate(
+                mixed_positions,
+                headers=[
+                    'seq id', 'nt Position', 'aa position',
+                    'nt composition', 'aa composition', 'gene name'
+                ]
+            ) + "\n"
+        )
+
+if __name__ == '__main__':
+    main()