20240309

sculab · Mar 9, 2024 · 35c50da · 35c50da
1 parent ed63253
commit 35c50da
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 32 deletions.
diff --git a/CODE/Main_Form.vb b/CODE/Main_Form.vb
@@ -4112,8 +4112,7 @@ Public Class Main_Form
                 Dim my_options(2) As String
                 If form_config_consensus.ComboBox1.SelectedIndex = 0 Then
                     my_options(1) = "results"
-                    my_options(2) = "0"
-
+                    my_options(1) = "1"
                 Else
                     my_options(1) = "best_refs"
                     my_options(2) = "0"

diff --git a/scripts/build_consensus.py b/scripts/build_consensus.py
@@ -40,16 +40,16 @@ def parse_cigar(cigarstring, seq, pos_ref):
     return seqout, insert
 
 def parse_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''基于SAM文件构建一致性序列''')
-    parser.add_argument("-i", "--input", action="store", dest="filename", default=r"D:\Working\Develop\EasyMiner Develop\EasyMiner\bin\Debug\net6.0-windows\results\1_RAPiD_Genomics_F088_UFL_394803_P005_WC12_i5_515_i7_132_S988_L002_R_001\muticopy\g6660_mft.sam",
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''Constructing Consensus Sequences Based on SAM Files''')
+    parser.add_argument("-i", "--input", action="store", dest="filename", default=r"D:\Develop\GeneMiner Develop\GeneMiner\bin\Debug\net6.0-windows\results\Barcode_adapter_BA1.sam",
                         help="Name of the SAM file, SAM does not need to be sorted and can be compressed with gzip")
-    parser.add_argument("-c", "--consensus-thresholds", action="store", dest="thresholds", type=str, default="0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75",
+    parser.add_argument("-c", "--consensus-thresholds", action="store", dest="thresholds", type=str, default="0.25",
                         help="List of consensus threshold(s) separated by commas, no spaces, example: -c 0.25,0.75,0.50, default=0.75")
     parser.add_argument("-n", action="store", dest="n", type=int, default=0,
                         help="Split FASTA output sequences every n nucleotides, default=do not split sequence")
-    parser.add_argument("-o", "--outfolder", action="store", dest="outfolder", default=r"D:\Working\Develop\EasyMiner Develop\EasyMiner\bin\Debug\net6.0-windows\results\1_RAPiD_Genomics_F088_UFL_394803_P005_WC12_i5_515_i7_132_S988_L002_R_001\muticopy",
+    parser.add_argument("-o", "--outfolder", action="store", dest="outfolder", default=r"D:\Develop\GeneMiner Develop\GeneMiner\bin\Debug\net6.0-windows\results",
                         help="Name of output folder, default=same folder as input")
-    parser.add_argument("-p", "--prefix", action="store", dest="prefix", default="",
+    parser.add_argument("-p", "--prefix", action="store", dest="prefix", default="Barcode_adapter_BA1_tmp",
                         help="Prefix for output file name, default=input filename without .sam extension")
     parser.add_argument("-m", "--min-depth", action="store", dest="min_depth", type=int, default=2,
                         help="Minimum read depth at each site to report the nucleotide in the consensus, default=1")
@@ -58,7 +58,7 @@ def parse_args():
     parser.add_argument("-d", "--maxdel", action="store", dest="maxdel", default=150,
                         help="Ignore deletions longer than this value, default=150")
     parser.add_argument("-s", "--save_mutations", action="store", type=int, dest="save_mutations", default=1,
-                        help="保存每个位点碱基比例")
+                        help="Save the Base Composition for Each Locus")
     return parser.parse_args()
 
 def process_sam_header(mapfile):
@@ -90,25 +90,26 @@ def parse_sam_file(mapfile, sequences, coverages, insertions, refname, maxdel):
 
         for line in mapfile_chunk:
             reads_total += 1
-            if line[0] != "@" and line.split("\t")[5] != "*":
-                reads_mapped += 1
-                sam_record = line.split("\t")
-                refname = sam_record[2].split()[0]
-                pos_ref = int(sam_record[3]) - 1
-                seqout, insert = parse_cigar(sam_record[5], sam_record[9], pos_ref)
-                if seqout.count("-") <= maxdel:
-                    for nuc in seqout:
-                        try:
-                            sequences[refname][pos_ref][nuc] += 1
+            if line[0] != "\t":
+                if line[0] != "@" and line.split("\t")[5] != "*":
+                    reads_mapped += 1
+                    sam_record = line.split("\t")
+                    refname = sam_record[2].split()[0]
+                    pos_ref = int(sam_record[3]) - 1
+                    seqout, insert = parse_cigar(sam_record[5], sam_record[9], pos_ref)
+                    if seqout.count("-") <= maxdel:
+                        for nuc in seqout:
+                            try:
+                                sequences[refname][pos_ref][nuc] += 1
+                                pos_ref += 1
+                            except:
+                                print(refname, pos_ref, nuc, sam_record)
+                    else:
+                        for nuc in seqout:
+                            if nuc != "-" and nuc != "*":
+                                sequences[refname][pos_ref][nuc] += 1
                             pos_ref += 1
-                        except:
-                            print(refname, pos_ref, nuc, sam_record)
-                else:
-                    for nuc in seqout:
-                        if nuc != "-" and nuc != "*":
-                            sequences[refname][pos_ref][nuc] += 1
-                        pos_ref += 1
-                        # Show progress
+                            # Show progress
             if reads_total % 500000 == 0:
                 print((str(reads_total)+" reads processed.\r"))
     mapfile.close()

diff --git a/scripts/split_barcode.py b/scripts/split_barcode.py
@@ -114,7 +114,7 @@ def main():
     if os.path.isdir(args.input):
         files_to_process = []
         for filename in os.listdir(args.input):
-            if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq"):
+            if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq") or filename.endswith(".fastq"):
                 query_file = fq_to_fasta(os.path.join(args.input, filename))
                 out_folder = os.path.join(args.output, os.path.splitext(filename)[0])
                 if os.path.exists(out_folder):
@@ -129,7 +129,7 @@ def main():
             shutil.rmtree(combine_folder)
         os.makedirs(combine_folder, exist_ok=True)
         for filename in os.listdir(args.input):
-            if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq"):
+            if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq") or filename.endswith(".fastq"):
                 query_file = fq_to_fasta(os.path.join(args.input, filename))
                 seq_file_name = os.path.basename(os.path.splitext(query_file)[0])
                 out_folder = os.path.join(args.output, seq_file_name)
@@ -181,6 +181,12 @@ def fq_to_fasta(input_file):
     elif input_file.endswith('.fq'):
         open_func = open
         output_file = './' + input_file.replace("\\","/").split('/')[-1].replace('.fq', '.fasta')
+    elif input_file.endswith('.fastq.gz'):
+        open_func = gzip.open
+        output_file = './' + input_file.replace("\\","/").split('/')[-1].replace('.fastq.gz', '.fasta')
+    elif input_file.endswith('.fastq'):
+        open_func = open
+        output_file = './' + input_file.replace("\\","/").split('/')[-1].replace('.fastq', '.fasta')
     else:
         return input_file
 
@@ -231,7 +237,6 @@ def split_subreads(query_file, subject_file, word_size, seq_file_name, out_folde
             else:
                 splited_file.write(record)
     splited_file.close()
-
 def get_subreads_dict(blast_output_file, _subreads_dict):
     with open(blast_output_file, "r") as blast_output:
         lines = blast_output.readlines()

diff --git a/scripts/split_file.py b/scripts/split_file.py
@@ -10,10 +10,10 @@ def parse_args():
     return parser.parse_args()
 
 def save_fasta(record, output_dir):
-    record.id = str(record.id).replace('.','_').replace(' ','_').replace('__','_').replace('-', '_').replace(':', '_').replace('|', '_')
+    tmp_name = str(record.id).replace('.','_').replace(' ','_').replace('__','_').replace('-', '_').replace(':', '_').replace('|', '_')
     record.description = record.id
-    filename = os.path.join(output_dir, f"{str(record.id)}.fasta")
-    record.seq = record.seq.ungap()  # Remove gaps if present
+    filename = os.path.join(output_dir, f"{tmp_name}.fasta")
+    record.seq = record.seq.replace("-", "")  # Remove gaps if present
     with open(filename, "w") as output_handle:
         SeqIO.write(record, output_handle, "fasta-2line")