From f5367fa58fce7f4c88aa74c57e4f463432efb8ba Mon Sep 17 00:00:00 2001 From: kalilamali Date: Sat, 11 Sep 2021 18:24:33 -0500 Subject: [PATCH] updated with comments --- README.md | 5 ++ get_mr0_test_cases.py | 33 +++++++++++-- get_mr1_test_cases.py | 34 ++++++++++++-- get_mr2_test_cases.py | 33 +++++++++++-- group_intfinder.py | 105 ++++++++++++++++++++++++++---------------- mrs.py | 23 +++++++-- 6 files changed, 181 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index e0481ca..45cea7f 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,11 @@ python3 mrs.py python3 mrs.py > report.tsv ``` # FAQ: + +group_intfinder.py uses kma's location:\ +/usr/bin/kma\ +if kma is not in that location please update the script. + If you get:\ ModuleNotFoundError: No module named 'tabulate'\ ModuleNotFoundError: No module named 'cgecore'\ diff --git a/get_mr0_test_cases.py b/get_mr0_test_cases.py index ae8582a..0a9876c 100644 --- a/get_mr0_test_cases.py +++ b/get_mr0_test_cases.py @@ -1,17 +1,42 @@ #! /usr/bin/env python3 +""" +get_mr0_test_cases.py +Creates fastq files with dummy scores from a folder containing fasta files. +Author Loaiza, K. +Comments Created: September, 2021. +""" + +# Libraries import os from Bio import SeqIO +# Folders in_dir = 'validation_seqs' out_dir = 'mr0_test_cases' -os.mkdir(out_dir) +# Create the new output folder +if not os.path.exists(out_dir): + os.mkdir(out_dir) +# Remove the old output folder +else: + os.remove(out_dir) + +# Get fasta files files_list = os.listdir(in_dir) files_list = [file for file in files_list if file.endswith('.fasta')] + for file in files_list: + # Get filename without .fasta + # In1538_MF974580.fasta > In1538_MF974580 name = os.path.splitext(file)[0] + # Get filepath + # validation_seqs/In1538_MF974580.fasta old_file = os.path.join(in_dir, file) - for r in SeqIO.parse(old_file, "fasta"): - r.letter_annotations["solexa_quality"] = [40] * len(r) + # Open the fasta file + for r in SeqIO.parse(old_file, 'fasta'): + # Assign dummy quality scores + r.letter_annotations['solexa_quality'] = [40] * len(r) + # Get the new filepath new_file = os.path.join(out_dir, name) - SeqIO.write(r, f"{new_file}.fastq", "fastq") + # Save the fasta with quality scores (fastq) + SeqIO.write(r, f'{new_file}.fastq', 'fastq') diff --git a/get_mr1_test_cases.py b/get_mr1_test_cases.py index af378a7..e70ea74 100644 --- a/get_mr1_test_cases.py +++ b/get_mr1_test_cases.py @@ -1,27 +1,55 @@ #! /usr/bin/env python3 +""" +get_mr1_test_cases.py +Creates fastq files with permutated reads from a folder containing fastq files. +Author Loaiza, K. +Comments Created: September, 2021. +""" + +# Libraries import os import random import numpy as np from Bio import SeqIO +# Folders in_dir = 'mr0_test_cases' out_dir = 'mr1_test_cases' -os.mkdir(out_dir) +# Create the new output folder +if not os.path.exists(out_dir): + os.mkdir(out_dir) +# Remove the old output folder +else: + os.remove(out_dir) + +# Get fastq files files_list = os.listdir(in_dir) files_list = [file for file in files_list if file.endswith('.fastq')] + for file in files_list: + # Get filename without .fastq + # In1538_MF974580.fastq > In1538_MF974580 name = os.path.splitext(file)[0] + # Get filepath + # mr0_test_cases/In1538_MF974580.fastq old_file = os.path.join(in_dir, file) - for r in SeqIO.parse(old_file, "fastq"): + # Open the fastq file + for r in SeqIO.parse(old_file, 'fastq'): + # Get the lenght of the fastq sequence P = len(r.seq) + # Get I random indices I = 2 indices = np.sort(np.random.randint(P, size = I)) print(f'{r.id} size {P} split at {indices}') + # Get the chunks of sequences s1 = r[:indices[0]] s2 = r[indices[0]+1:indices[1]] s3 = r[indices[1]+1:] + # Randomly shuffle the chunks of sequences records = [s1, s2, s3] random.shuffle(records) + # Get the new filepath new_file = os.path.join(out_dir, name) - SeqIO.write(records, f"{new_file}.fastq", "fastq") + # Save the chunks of sequences in one fastq file + SeqIO.write(records, f'{new_file}.fastq', 'fastq') diff --git a/get_mr2_test_cases.py b/get_mr2_test_cases.py index 3c9b8b6..13e4597 100644 --- a/get_mr2_test_cases.py +++ b/get_mr2_test_cases.py @@ -1,26 +1,53 @@ #! /usr/bin/env python3 +""" +get_mr2_test_cases.py +Creates fastq files with duplicated reads from a folder containing fastq files. +Author Loaiza, K. +Comments Created: September 11, 2021. +""" + +# Libraries import os import numpy as np from Bio import SeqIO +# Folders in_dir = 'mr0_test_cases' out_dir = 'mr2_test_cases' -os.mkdir(out_dir) +# Create the new output folder +if not os.path.exists(out_dir): + os.mkdir(out_dir) +# Remove the old output folder +else: + os.remove(out_dir) + +# Get fastq files files_list = os.listdir(in_dir) files_list = [file for file in files_list if file.endswith('.fastq')] + for file in files_list: + # Get filename without .fastq + # In1538_MF974580.fastq > In1538_MF974580 name = os.path.splitext(file)[0] + # Get filepath + # mr0_test_cases/In1538_MF974580.fastq old_file = os.path.join(in_dir, file) - for r in SeqIO.parse(old_file, "fastq"): + # Open the fastq file + for r in SeqIO.parse(old_file, 'fastq'): + # Get the lenght of the fastq sequence P = len(r.seq) + # Get I random indices I = 1 indices = np.sort(np.random.randint(P, size = I)) print(f'{r.id} size {P} split at {indices}') + # Get the chunks of sequences s1 = r[:indices[0]] s2 = r[indices[0]+1:] s3 = r[:indices[0]] s4 = r[indices[0]+1:] records = [s1, s2, s3, s4] + # Get the new filepath new_file = os.path.join(out_dir, name) - SeqIO.write(records, f"{new_file}.fastq", "fastq") + # Save the chunks of sequences in one fastq file + SeqIO.write(records, f'{new_file}.fastq', 'fastq') diff --git a/group_intfinder.py b/group_intfinder.py index 6996476..90678cf 100644 --- a/group_intfinder.py +++ b/group_intfinder.py @@ -1,66 +1,93 @@ #!/usr/bin/env python3 - -"""Group_intfinder.py -This program runs intfinder for a folder containing many fastq files. - -Author K.Loaiza -Comments Created: Thursday, April 18, 2020""" - -import argparse +""" +group_intfinder.py +Runs intfinder for a folder containing fastq files. +Author Loaiza, K. +Comments Created: April, 2020. +""" + +# Libraries import os +import argparse import tempfile - from subprocess import check_call # command tool +# Menu parser = argparse.ArgumentParser() parser.add_argument('--in_dir', help='folder with input sequences') parser.add_argument('--out_dir', help='folder to place the results') - +# Functions def get_files(in_dir): - """Function that takes an input folder and returns a list of all the fasta - files inside""" + """ + Function that takes an input folder and returns a list of all the fastq + files inside + """ + # Get fastq files files_list = os.listdir(in_dir) - files_list = [os.path.join(in_dir, file) for file in files_list if file.endswith('.fastq')] + files_list = [os.path.join(in_dir, file) + for file + in files_list + if file.endswith('.fastq')] return files_list - -def get_intfinder_cmd(fasta, out_dir): - """Function that takes a fasta and returns an intfinder command""" - identity_value = 0.5 - coverage_value = 0.5 - depth_value = 0.5 - intfinder_results_dir = os.path.join(out_dir, f'intfinder_results_{coverage_value}_{identity_value}_{depth_value}') - if not os.path.exists(intfinder_results_dir): - os.mkdir(intfinder_results_dir) - - intfinder_results_path = os.path.join(intfinder_results_dir, os.path.splitext(os.path.basename(fasta))[0]) - if not os.path.exists(intfinder_results_path): - os.mkdir(intfinder_results_path) - - intfinder_command = ("python3 intfinder/intfinder.py" - f" -i {fasta}" - f" -o {intfinder_results_path}" +def get_intfinder_cmd(fastq, out_dir): + """ + Function that takes a fastq and returns an intfinder command + """ + # Values range 0-1 + # For example 0.5 means 50% identity + identity = 0.5 + coverage = 0.5 + depth = 0.5 + + # New folder name with the values used + # intfinder_results_0.5_0.5_0.5 + dir = f'intfinder_results_{coverage}_{identity}_{depth}' + results_dir = os.path.join(out_dir, dir) + # Create the new sub folder + if not os.path.exists(results_dir): + os.mkdir(results_dir) + + # Get the filename to use it as a dirname + # In431_AY029772.fastq > In431_AY029772 + dir = os.path.splitext(os.path.basename(fastq))[0] + results_sub_dir = os.path.join(results_dir, dir) + # Create the new sub sub folder + if not os.path.exists(results_sub_dir): + os.mkdir(results_sub_dir) + + # Get intfinder_db location + db_loc = os.path.join(os.getcwd(), 'intfinder_db') + + # Create the intfinder command + intfinder_cmd = ("python3 intfinder/intfinder.py" + f" -i {fastq}" + f" -o {results_sub_dir}" #f" -tmp {tmp_dir}" f" -mp /usr/bin/kma" - f" -p /home/david/MyUnixWorkplace/intfinder_db" - f" -l {coverage_value}" - f" -t {identity_value}" - f" -d {depth_value}" + f" -p {db_loc}" + f" -l {coverage}" + f" -t {identity}" + f" -d {depth}" f" -x" f" -q ") + return intfinder_cmd, results_sub_dir - return intfinder_command, intfinder_results_path - - +# Main program if __name__ == '__main__': + # Menu args = parser.parse_args() + # Create the new output folder if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) + # Get fastq files files_list = get_files(args.in_dir) - for fasta in files_list: - intfinder_cmd, intfinder_results_path = get_intfinder_cmd(fasta, args.out_dir) + # For each file get an intfinder command + for fastq in files_list: + intfinder_cmd, results_sub_dir = get_intfinder_cmd(fastq, args.out_dir) #print(intfinder_cmd) + # Run the intfinder command on the shell check_call(intfinder_cmd, shell=True) diff --git a/mrs.py b/mrs.py index ac914c4..b56ad04 100644 --- a/mrs.py +++ b/mrs.py @@ -1,38 +1,55 @@ #!/usr/bin/env python3 +""" +mrs.py +Author Loaiza, K. +Comments Created: September, 2021. +""" + +# Libraries import os from subprocess import check_call # command tool +# Functions def get_dirs(in_dir): """ - Function that takes an input folder and returns a list of all the dirs inside + Function that takes an input folder and returns a list of all the dirs + inside """ dir_list = os.listdir(in_dir) dir_list = [os.path.join(in_dir, dir) for dir in dir_list] dir_list = [dir for dir in dir_list if os.path.isdir(dir)] return dir_list +# Number of metamorphic relations mrs = 3 for m in range(0, mrs): + # Number of replicas replicas = 5 for r in range(0, replicas): - cmd = f"python3 group_intfinder.py --in_dir mr{m}_test_cases --out_dir test_cases_results/mr{m}_{r}" + cmd = (f"python3 group_intfinder.py" + f" --in_dir mr{m}_test_cases" + f" --out_dir test_cases_results/mr{m}_{r}") #print(cmd) + # Run cmd on the shell check_call(cmd, shell=True) + # Get the path for the results of each metamorphic relation and replica mr_dir = os.path.join(f"test_cases_results/mr{m}_{r}", 'intfinder_results_0.5_0.5_0.5') - dir_list = get_dirs(mr_dir) for dir in dir_list: seq_file = os.path.basename(dir) observed = '' expected = seq_file.split('_')[0] #In560_HE613853 -> In560 + # Open results file res_file = os.path.join(dir, 'results_tab.tsv') with open(res_file) as infile: for line in infile: + # Ignore headline if line.startswith('Database'): pass else: observed = line.split()[1] + # Report if expected == observed: print(f'mr{m}\tpassed\t{seq_file}\t{observed}') if expected != observed: