Merge branch 'master' of github.com:xguse/rSeqPipeline

xguse · Jan 30, 2013 · 04b635b · 04b635b
2 parents 576fe92 + 59277ff
commit 04b635b
Show file tree

Hide file tree

Showing 4 changed files with 107 additions and 9 deletions.
diff --git a/rSeq/scripts/unique_novel_genes_in_gtf_4mariangela.py b/rSeq/scripts/unique_novel_genes_in_gtf_4mariangela.py
@@ -0,0 +1,37 @@
+"""
+tool for mariangela to sort out the number of cufflinks novel
+genes who have at least one exon far from any annotated exon.
+"""
+
+import argparse
+
+
+def main():
+    """
+    GIVEN:
+        - command line argument 'gtf' = path to a tab-delimited gtf file
+    DOES:
+        - Reads the gtf file one line at a time, skipping blank lines
+          and '#' comment lines.
+        - Takes the first double-quoted value in the 9th column of each
+          remaining line as that line's xloc symbol.
+        - Prints the number of unique xloc symbols, then the sorted
+          symbols one per line.
+    RETURNS:
+        - None
+    """
+
+    desc = """tool for mariangela to sort out the number of cufflinks novel
+genes who have at least one exon far from any annotated exon."""
+
+    parser = argparse.ArgumentParser(description=desc)
+
+
+    # 'gtf' is a required positional, so there is no default to advertise
+    # (the old help text rendered as "(default: None)").
+    parser.add_argument('gtf', type=str,
+                        help="""Path to gtf file.""")
+
+    args = parser.parse_args()
+
+    xloc_set = set()
+
+    # 'with' guarantees the file is closed even if a line fails to parse.
+    with open(args.gtf,'rU') as gtf:
+        for line in gtf:
+            line = line.strip('\n')
+            # Blank lines and comment lines have no 9th column to parse.
+            if not line.strip() or line.startswith('#'):
+                continue
+            fields = line.split('\t')
+            comments = fields[8].split('"')
+            xloc = comments[1]
+            xloc_set.add(xloc)
+
+    # Single-argument print(...) behaves the same as the print statement here.
+    print('Number of unique xloc symbols: %s' % (len(xloc_set)))
+    print('Unique xloc symbols:\n%s' % ('\n'.join(sorted(xloc_set))))
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/rSeq/utils/convert.py b/rSeq/utils/convert.py
@@ -164,17 +164,22 @@ def MB_2_gff3(resultTablePath,gff3Path):
     gff3_lines = []
 
     mb_table = tableFile2namedTuple(resultTablePath,sep='\t')
-
+    skipped = 0
     for line in mb_table:
-        gff3_seqid = align_feat.chr
+        chrm,left,right = line.locus.replace('-',':').split(':')
+        if int(left) < 1:
+            skipped += 1
+            continue
+        gff3_seqid = chrm 
         gff3_source = 'Cufflinks'
         gff3_type = 'Assembled Tx boundries'
         gff3_start = left
         gff3_end = right
         gff3_score = line.q_value
-        gff3_strand = strandConvertions[align_feat.seq_region_strand]
+        gff3_strand = '?'
         gff3_phase = '.'
-        gff3_attributes = 'ID=%s;Alias=%s' % (align_feat.dna_align_feature_id, align_feat.hit_name)
+        gff3_attributes = 'ID=%s;Alias=%s;Note=%s' % \
+            (line.tracking_id, line.nearest_ref_id, line.class_code)
 
         gff3_lines.append([gff3_seqid,
                            gff3_source,
@@ -184,8 +189,14 @@ def MB_2_gff3(resultTablePath,gff3Path):
                            gff3_score,
                            gff3_strand,
                            gff3_phase,
-                           gff3_attributes])        
+                           gff3_attributes])
+
+    gff3Out = open(gff3Path,'w')
+    for line in gff3_lines:
+        gff3Out.write('%s\n' % ('\t'.join(line)))
+    gff3Out.close()
 
+    return skipped
 
 def vectorBaseESTs_2_gff3(resultTablePath,gff3Path):
     """

diff --git a/rSeq/utils/files.py b/rSeq/utils/files.py
@@ -634,3 +634,36 @@ def renameChrom_in_SAM(path):
     except:
         raise
 
+
+def rename_fasta_headers(in_path,out_path,header_func):
+    """
+    GIVEN:
+        - in_path = path to original fasta file
+        - out_path = path to future altered fasta file
+        - header_func = function to take a header line (including its
+          trailing newline) and return an altered string version of it
+    DOES:
+        - Reads in in_path file one line at a time
+        - If the line is a fasta header (starts with '>')
+          uses header_func logic to rearrange the header and
+          writes out the changed line to out_path, terminated by
+          exactly one newline.
+        - If not a header, writes same line out to out_path.
+        - Closes both file objects, even if header_func or a write
+          raises an exception.
+    RETURNS:
+        - None
+    """
+
+    # Nested 'with' blocks close both files on every exit path.
+    with open(in_path,'rU') as in_file:
+        with open(out_path,'w') as out_file:
+            for line in in_file:
+                if line.startswith('>'):
+                    # Handle and ensure that each modified line has one
+                    # and only one \n, whatever header_func returned.
+                    line = header_func(line).rstrip('\n') + '\n'
+
+                out_file.write(line)
diff --git a/rSeq/utils/motifDiscovery/motifs.py b/rSeq/utils/motifDiscovery/motifs.py
@@ -271,8 +271,16 @@ def toXMSmotif(self):
         this motif.
         """
         # ++ initialize xms motif string ++
-        xMtf = '<motif>\n\t<name>%s_%.3f</name>\n\t\t<weightmatrix alphabet="DNA" columns="%s">\n' %\
-             (self.consensus,float(self.sigvalue),len(self.pwm['A']))
+        try:
+            xMtf = '<motif>\n\t<name>%s_%.3f</name>\n\t\t<weightmatrix alphabet="DNA" columns="%s">\n' %\
+                 (self.consensus,float(self.sigvalue),len(self.pwm['A']))
+        except AttributeError as err:
+            if 'sigvalue' in str(err.message):
+                self.sigvalue = 'nan'
+                xMtf = '<motif>\n\t<name>%s_%.3f</name>\n\t\t<weightmatrix alphabet="DNA" columns="%s">\n' %\
+                             (self.consensus,float(self.sigvalue),len(self.pwm['A']))
+            else:
+                raise err
 
         # ++ create and add each column's data ++                  
         for i in range(len(self.pwm['A'])):
@@ -290,8 +298,17 @@ def toXMSmotif(self):
              (self.sigvalue)
         xMtf += '\t\t<prop>\n\t\t\t<key>rank</key>\n\t\t\t<value>%s</value></prop>\n' % \
              (self.rank)
-        xMtf += '\t\t<prop>\n\t\t\t<key>algorithm</key>\n\t\t\t<value>%s</value></prop>\n' % \
-             (self.algorithm)
+        try:
+            xMtf += '\t\t<prop>\n\t\t\t<key>algorithm</key>\n\t\t\t<value>%s</value></prop>\n' % \
+                 (self.algorithm)
+        except AttributeError as err:
+            if 'algorithm' in str(err.message):
+                self.algorithm = 'nan'
+                xMtf += '\t\t<prop>\n\t\t\t<key>algorithm</key>\n\t\t\t<value>%s</value></prop>\n' % \
+                     (self.algorithm)
+            else:
+                raise err
+
         xMtf += '</motif>\n'
 
         return xMtf