Skip to content

Commit

Permalink
20240309
Browse files Browse the repository at this point in the history
  • Loading branch information
yyscu committed Mar 9, 2024
1 parent ed63253 commit 35c50da
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 32 deletions.
3 changes: 1 addition & 2 deletions CODE/Main_Form.vb
Original file line number Diff line number Diff line change
Expand Up @@ -4112,8 +4112,7 @@ Public Class Main_Form
Dim my_options(2) As String
If form_config_consensus.ComboBox1.SelectedIndex = 0 Then
my_options(1) = "results"
my_options(2) = "0"

my_options(1) = "1"
Else
my_options(1) = "best_refs"
my_options(2) = "0"
Expand Down
49 changes: 25 additions & 24 deletions scripts/build_consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,16 @@ def parse_cigar(cigarstring, seq, pos_ref):
return seqout, insert

def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''基于SAM文件构建一致性序列''')
parser.add_argument("-i", "--input", action="store", dest="filename", default=r"D:\Working\Develop\EasyMiner Develop\EasyMiner\bin\Debug\net6.0-windows\results\1_RAPiD_Genomics_F088_UFL_394803_P005_WC12_i5_515_i7_132_S988_L002_R_001\muticopy\g6660_mft.sam",
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''Constructing Consensus Sequences Based on SAM Files''')
parser.add_argument("-i", "--input", action="store", dest="filename", default=r"D:\Develop\GeneMiner Develop\GeneMiner\bin\Debug\net6.0-windows\results\Barcode_adapter_BA1.sam",
help="Name of the SAM file, SAM does not need to be sorted and can be compressed with gzip")
parser.add_argument("-c", "--consensus-thresholds", action="store", dest="thresholds", type=str, default="0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75",
parser.add_argument("-c", "--consensus-thresholds", action="store", dest="thresholds", type=str, default="0.25",
help="List of consensus threshold(s) separated by commas, no spaces, example: -c 0.25,0.75,0.50, default=0.75")
parser.add_argument("-n", action="store", dest="n", type=int, default=0,
help="Split FASTA output sequences every n nucleotides, default=do not split sequence")
parser.add_argument("-o", "--outfolder", action="store", dest="outfolder", default=r"D:\Working\Develop\EasyMiner Develop\EasyMiner\bin\Debug\net6.0-windows\results\1_RAPiD_Genomics_F088_UFL_394803_P005_WC12_i5_515_i7_132_S988_L002_R_001\muticopy",
parser.add_argument("-o", "--outfolder", action="store", dest="outfolder", default=r"D:\Develop\GeneMiner Develop\GeneMiner\bin\Debug\net6.0-windows\results",
help="Name of output folder, default=same folder as input")
parser.add_argument("-p", "--prefix", action="store", dest="prefix", default="",
parser.add_argument("-p", "--prefix", action="store", dest="prefix", default="Barcode_adapter_BA1_tmp",
help="Prefix for output file name, default=input filename without .sam extension")
parser.add_argument("-m", "--min-depth", action="store", dest="min_depth", type=int, default=2,
help="Minimum read depth at each site to report the nucleotide in the consensus, default=1")
Expand All @@ -58,7 +58,7 @@ def parse_args():
parser.add_argument("-d", "--maxdel", action="store", dest="maxdel", default=150,
help="Ignore deletions longer than this value, default=150")
parser.add_argument("-s", "--save_mutations", action="store", type=int, dest="save_mutations", default=1,
help="保存每个位点碱基比例")
help="Save the Base Composition for Each Locus")
return parser.parse_args()

def process_sam_header(mapfile):
Expand Down Expand Up @@ -90,25 +90,26 @@ def parse_sam_file(mapfile, sequences, coverages, insertions, refname, maxdel):

for line in mapfile_chunk:
reads_total += 1
if line[0] != "@" and line.split("\t")[5] != "*":
reads_mapped += 1
sam_record = line.split("\t")
refname = sam_record[2].split()[0]
pos_ref = int(sam_record[3]) - 1
seqout, insert = parse_cigar(sam_record[5], sam_record[9], pos_ref)
if seqout.count("-") <= maxdel:
for nuc in seqout:
try:
sequences[refname][pos_ref][nuc] += 1
if line[0] != "\t":
if line[0] != "@" and line.split("\t")[5] != "*":
reads_mapped += 1
sam_record = line.split("\t")
refname = sam_record[2].split()[0]
pos_ref = int(sam_record[3]) - 1
seqout, insert = parse_cigar(sam_record[5], sam_record[9], pos_ref)
if seqout.count("-") <= maxdel:
for nuc in seqout:
try:
sequences[refname][pos_ref][nuc] += 1
pos_ref += 1
except:
print(refname, pos_ref, nuc, sam_record)
else:
for nuc in seqout:
if nuc != "-" and nuc != "*":
sequences[refname][pos_ref][nuc] += 1
pos_ref += 1
except:
print(refname, pos_ref, nuc, sam_record)
else:
for nuc in seqout:
if nuc != "-" and nuc != "*":
sequences[refname][pos_ref][nuc] += 1
pos_ref += 1
# Show progress
# Show progress
if reads_total % 500000 == 0:
print((str(reads_total)+" reads processed.\r"))
mapfile.close()
Expand Down
11 changes: 8 additions & 3 deletions scripts/split_barcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def main():
if os.path.isdir(args.input):
files_to_process = []
for filename in os.listdir(args.input):
if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq"):
if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq") or filename.endswith(".fastq"):
query_file = fq_to_fasta(os.path.join(args.input, filename))
out_folder = os.path.join(args.output, os.path.splitext(filename)[0])
if os.path.exists(out_folder):
Expand All @@ -129,7 +129,7 @@ def main():
shutil.rmtree(combine_folder)
os.makedirs(combine_folder, exist_ok=True)
for filename in os.listdir(args.input):
if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq"):
if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta") or filename.endswith(".fq") or filename.endswith(".fastq"):
query_file = fq_to_fasta(os.path.join(args.input, filename))
seq_file_name = os.path.basename(os.path.splitext(query_file)[0])
out_folder = os.path.join(args.output, seq_file_name)
Expand Down Expand Up @@ -181,6 +181,12 @@ def fq_to_fasta(input_file):
elif input_file.endswith('.fq'):
open_func = open
output_file = './' + input_file.replace("\\","/").split('/')[-1].replace('.fq', '.fasta')
elif input_file.endswith('.fastq.gz'):
open_func = gzip.open
output_file = './' + input_file.replace("\\","/").split('/')[-1].replace('.fastq.gz', '.fasta')
elif input_file.endswith('.fastq'):
open_func = open
output_file = './' + input_file.replace("\\","/").split('/')[-1].replace('.fastq', '.fasta')
else:
return input_file

Expand Down Expand Up @@ -231,7 +237,6 @@ def split_subreads(query_file, subject_file, word_size, seq_file_name, out_folde
else:
splited_file.write(record)
splited_file.close()

def get_subreads_dict(blast_output_file, _subreads_dict):
with open(blast_output_file, "r") as blast_output:
lines = blast_output.readlines()
Expand Down
6 changes: 3 additions & 3 deletions scripts/split_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ def parse_args():
return parser.parse_args()

def save_fasta(record, output_dir):
record.id = str(record.id).replace('.','_').replace(' ','_').replace('__','_').replace('-', '_').replace(':', '_').replace('|', '_')
tmp_name = str(record.id).replace('.','_').replace(' ','_').replace('__','_').replace('-', '_').replace(':', '_').replace('|', '_')
record.description = record.id
filename = os.path.join(output_dir, f"{str(record.id)}.fasta")
record.seq = record.seq.ungap() # Remove gaps if present
filename = os.path.join(output_dir, f"{tmp_name}.fasta")
record.seq = record.seq.replace("-", "") # Remove gaps if present
with open(filename, "w") as output_handle:
SeqIO.write(record, output_handle, "fasta-2line")

Expand Down

0 comments on commit 35c50da

Please sign in to comment.