Skip to content

Commit

Permalink
updated with comments
Browse files Browse the repository at this point in the history
  • Loading branch information
kalilamali committed Sep 11, 2021
1 parent 478c6e2 commit f5367fa
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 52 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ python3 mrs.py
python3 mrs.py > report.tsv
```
# FAQ:

group_intfinder.py uses kma's location:\
/usr/bin/kma\
if kma is not in that location please update the script.

If you get:\
ModuleNotFoundError: No module named 'tabulate'\
ModuleNotFoundError: No module named 'cgecore'\
Expand Down
33 changes: 29 additions & 4 deletions get_mr0_test_cases.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,42 @@
#! /usr/bin/env python3
"""
get_mr0_test_cases.py
Creates fastq files with dummy scores from a folder containing fasta files.
Author Loaiza, K.
Comments Created: September, 2021.
"""

# Libraries
import os
from Bio import SeqIO

# Folders
in_dir = 'validation_seqs'
out_dir = 'mr0_test_cases'
os.mkdir(out_dir)

# Create the new output folder
if not os.path.exists(out_dir):
os.mkdir(out_dir)
# Remove the old output folder
else:
os.remove(out_dir)

# Get fasta files
files_list = os.listdir(in_dir)
files_list = [file for file in files_list if file.endswith('.fasta')]

for file in files_list:
# Get filename without .fasta
# In1538_MF974580.fasta > In1538_MF974580
name = os.path.splitext(file)[0]
# Get filepath
# validation_seqs/In1538_MF974580.fasta
old_file = os.path.join(in_dir, file)
for r in SeqIO.parse(old_file, "fasta"):
r.letter_annotations["solexa_quality"] = [40] * len(r)
# Open the fasta file
for r in SeqIO.parse(old_file, 'fasta'):
# Assign dummy quality scores
r.letter_annotations['solexa_quality'] = [40] * len(r)
# Get the new filepath
new_file = os.path.join(out_dir, name)
SeqIO.write(r, f"{new_file}.fastq", "fastq")
# Save the fasta with quality scores (fastq)
SeqIO.write(r, f'{new_file}.fastq', 'fastq')
34 changes: 31 additions & 3 deletions get_mr1_test_cases.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,55 @@
#! /usr/bin/env python3
"""
get_mr1_test_cases.py
Creates fastq files with permutated reads from a folder containing fastq files.
Author Loaiza, K.
Comments Created: September, 2021.
"""

# Libraries
import os
import random
import numpy as np
from Bio import SeqIO

# Folders
in_dir = 'mr0_test_cases'
out_dir = 'mr1_test_cases'
os.mkdir(out_dir)

# Create the new output folder
if not os.path.exists(out_dir):
os.mkdir(out_dir)
# Remove the old output folder
else:
os.remove(out_dir)

# Get fastq files
files_list = os.listdir(in_dir)
files_list = [file for file in files_list if file.endswith('.fastq')]

for file in files_list:
# Get filename without .fastq
# In1538_MF974580.fastq > In1538_MF974580
name = os.path.splitext(file)[0]
# Get filepath
# mr0_test_cases/In1538_MF974580.fastq
old_file = os.path.join(in_dir, file)
for r in SeqIO.parse(old_file, "fastq"):
# Open the fastq file
for r in SeqIO.parse(old_file, 'fastq'):
# Get the lenght of the fastq sequence
P = len(r.seq)
# Get I random indices
I = 2
indices = np.sort(np.random.randint(P, size = I))
print(f'{r.id} size {P} split at {indices}')
# Get the chunks of sequences
s1 = r[:indices[0]]
s2 = r[indices[0]+1:indices[1]]
s3 = r[indices[1]+1:]
# Randomly shuffle the chunks of sequences
records = [s1, s2, s3]
random.shuffle(records)
# Get the new filepath
new_file = os.path.join(out_dir, name)
SeqIO.write(records, f"{new_file}.fastq", "fastq")
# Save the chunks of sequences in one fastq file
SeqIO.write(records, f'{new_file}.fastq', 'fastq')
33 changes: 30 additions & 3 deletions get_mr2_test_cases.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,53 @@
#! /usr/bin/env python3
"""
get_mr2_test_cases.py
Creates fastq files with duplicated reads from a folder containing fastq files.
Author Loaiza, K.
Comments Created: September 11, 2021.
"""

# Libraries
import os
import numpy as np
from Bio import SeqIO

# Folders
in_dir = 'mr0_test_cases'
out_dir = 'mr2_test_cases'
os.mkdir(out_dir)

# Create the new output folder
if not os.path.exists(out_dir):
os.mkdir(out_dir)
# Remove the old output folder
else:
os.remove(out_dir)

# Get fastq files
files_list = os.listdir(in_dir)
files_list = [file for file in files_list if file.endswith('.fastq')]

for file in files_list:
# Get filename without .fastq
# In1538_MF974580.fastq > In1538_MF974580
name = os.path.splitext(file)[0]
# Get filepath
# mr0_test_cases/In1538_MF974580.fastq
old_file = os.path.join(in_dir, file)
for r in SeqIO.parse(old_file, "fastq"):
# Open the fastq file
for r in SeqIO.parse(old_file, 'fastq'):
# Get the lenght of the fastq sequence
P = len(r.seq)
# Get I random indices
I = 1
indices = np.sort(np.random.randint(P, size = I))
print(f'{r.id} size {P} split at {indices}')
# Get the chunks of sequences
s1 = r[:indices[0]]
s2 = r[indices[0]+1:]
s3 = r[:indices[0]]
s4 = r[indices[0]+1:]
records = [s1, s2, s3, s4]
# Get the new filepath
new_file = os.path.join(out_dir, name)
SeqIO.write(records, f"{new_file}.fastq", "fastq")
# Save the chunks of sequences in one fastq file
SeqIO.write(records, f'{new_file}.fastq', 'fastq')
105 changes: 66 additions & 39 deletions group_intfinder.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,93 @@
#!/usr/bin/env python3

"""Group_intfinder.py
This program runs intfinder for a folder containing many fastq files.
Author K.Loaiza
Comments Created: Thursday, April 18, 2020"""

import argparse
"""
group_intfinder.py
Runs intfinder for a folder containing fastq files.
Author Loaiza, K.
Comments Created: April, 2020.
"""

# Libraries
import os
import argparse
import tempfile

from subprocess import check_call # command tool

# Menu
parser = argparse.ArgumentParser()
parser.add_argument('--in_dir', help='folder with input sequences')
parser.add_argument('--out_dir', help='folder to place the results')


# Functions
def get_files(in_dir):
"""Function that takes an input folder and returns a list of all the fasta
files inside"""
"""
Function that takes an input folder and returns a list of all the fastq
files inside
"""
# Get fastq files
files_list = os.listdir(in_dir)
files_list = [os.path.join(in_dir, file) for file in files_list if file.endswith('.fastq')]
files_list = [os.path.join(in_dir, file)
for file
in files_list
if file.endswith('.fastq')]
return files_list


def get_intfinder_cmd(fasta, out_dir):
"""Function that takes a fasta and returns an intfinder command"""
identity_value = 0.5
coverage_value = 0.5
depth_value = 0.5
intfinder_results_dir = os.path.join(out_dir, f'intfinder_results_{coverage_value}_{identity_value}_{depth_value}')
if not os.path.exists(intfinder_results_dir):
os.mkdir(intfinder_results_dir)

intfinder_results_path = os.path.join(intfinder_results_dir, os.path.splitext(os.path.basename(fasta))[0])
if not os.path.exists(intfinder_results_path):
os.mkdir(intfinder_results_path)

intfinder_command = ("python3 intfinder/intfinder.py"
f" -i {fasta}"
f" -o {intfinder_results_path}"
def get_intfinder_cmd(fastq, out_dir):
"""
Function that takes a fastq and returns an intfinder command
"""
# Values range 0-1
# For example 0.5 means 50% identity
identity = 0.5
coverage = 0.5
depth = 0.5

# New folder name with the values used
# intfinder_results_0.5_0.5_0.5
dir = f'intfinder_results_{coverage}_{identity}_{depth}'
results_dir = os.path.join(out_dir, dir)
# Create the new sub folder
if not os.path.exists(results_dir):
os.mkdir(results_dir)

# Get the filename to use it as a dirname
# In431_AY029772.fastq > In431_AY029772
dir = os.path.splitext(os.path.basename(fastq))[0]
results_sub_dir = os.path.join(results_dir, dir)
# Create the new sub sub folder
if not os.path.exists(results_sub_dir):
os.mkdir(results_sub_dir)

# Get intfinder_db location
db_loc = os.path.join(os.getcwd(), 'intfinder_db')

# Create the intfinder command
intfinder_cmd = ("python3 intfinder/intfinder.py"
f" -i {fastq}"
f" -o {results_sub_dir}"
#f" -tmp {tmp_dir}"
f" -mp /usr/bin/kma"
f" -p /home/david/MyUnixWorkplace/intfinder_db"
f" -l {coverage_value}"
f" -t {identity_value}"
f" -d {depth_value}"
f" -p {db_loc}"
f" -l {coverage}"
f" -t {identity}"
f" -d {depth}"
f" -x"
f" -q ")
return intfinder_cmd, results_sub_dir

return intfinder_command, intfinder_results_path


# Main program
if __name__ == '__main__':
# Menu
args = parser.parse_args()

# Create the new output folder
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir)

# Get fastq files
files_list = get_files(args.in_dir)
for fasta in files_list:
intfinder_cmd, intfinder_results_path = get_intfinder_cmd(fasta, args.out_dir)
# For each file get an intfinder command
for fastq in files_list:
intfinder_cmd, results_sub_dir = get_intfinder_cmd(fastq, args.out_dir)
#print(intfinder_cmd)
# Run the intfinder command on the shell
check_call(intfinder_cmd, shell=True)
23 changes: 20 additions & 3 deletions mrs.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,55 @@
#!/usr/bin/env python3
"""
mrs.py
Author Loaiza, K.
Comments Created: September, 2021.
"""

# Libraries
import os
from subprocess import check_call # command tool

# Functions
def get_dirs(in_dir):
"""
Function that takes an input folder and returns a list of all the dirs inside
Function that takes an input folder and returns a list of all the dirs
inside
"""
dir_list = os.listdir(in_dir)
dir_list = [os.path.join(in_dir, dir) for dir in dir_list]
dir_list = [dir for dir in dir_list if os.path.isdir(dir)]
return dir_list

# Number of metamorphic relations
mrs = 3
for m in range(0, mrs):
# Number of replicas
replicas = 5
for r in range(0, replicas):
cmd = f"python3 group_intfinder.py --in_dir mr{m}_test_cases --out_dir test_cases_results/mr{m}_{r}"
cmd = (f"python3 group_intfinder.py"
f" --in_dir mr{m}_test_cases"
f" --out_dir test_cases_results/mr{m}_{r}")
#print(cmd)
# Run cmd on the shell
check_call(cmd, shell=True)

# Get the path for the results of each metamorphic relation and replica
mr_dir = os.path.join(f"test_cases_results/mr{m}_{r}", 'intfinder_results_0.5_0.5_0.5')

dir_list = get_dirs(mr_dir)
for dir in dir_list:
seq_file = os.path.basename(dir)
observed = ''
expected = seq_file.split('_')[0] #In560_HE613853 -> In560
# Open results file
res_file = os.path.join(dir, 'results_tab.tsv')
with open(res_file) as infile:
for line in infile:
# Ignore headline
if line.startswith('Database'):
pass
else:
observed = line.split()[1]
# Report
if expected == observed:
print(f'mr{m}\tpassed\t{seq_file}\t{observed}')
if expected != observed:
Expand Down

0 comments on commit f5367fa

Please sign in to comment.