Skip to content

Commit

Permalink
Added structure for latest Nanosim
Browse files Browse the repository at this point in the history
  • Loading branch information
AlphaSquad committed Jul 5, 2021
1 parent 836f559 commit 3dab265
Show file tree
Hide file tree
Showing 19 changed files with 1,135 additions and 6 deletions.
110 changes: 109 additions & 1 deletion scripts/ReadSimulationWrapper/readsimulationwrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def _simulate_reads(self, dict_id_abundance, dict_id_file_path, factor, director
abundance = dict_id_abundance[genome_id]
if abundance == 0:
continue
if self._label == "ReadSimulationWgsim" or self._label == "ReadSimulationNanosim":
if self._label == "ReadSimulationWgsim" or "ReadSimulationNanosim" in self._label:
# name "fold_coverage" is misleading for wgsim/nanosim, which use number of reads as input
fold_coverage = int(round(abundance * factor / self._fragment_size_mean))
else:
Expand Down Expand Up @@ -421,6 +421,113 @@ def _get_sys_cmd(self, file_path_input, fold_coverage, file_path_output_prefix):
cmd = "{exe} {args}".format(exe=self._file_path_executable, args=" ".join(arguments))
return cmd

class ReadSimulationNanosim3(ReadSimulationWrapper):
"""
Simulate long reads(Oxford Nanopore) using nanosim
"""

_label = "ReadSimulationNanosim"

def __init__(self, file_path_executable, directory_error_profiles, **kwargs):
super(ReadSimulationNanosim3, self).__init__(file_path_executable, **kwargs)
self._profile = 'standard'

def simulate(
self, file_path_distribution, file_path_genome_locations, directory_output,
total_size, profile, fragment_size_mean, fragment_size_standard_deviation):
"""
Simulate reads based on a given sample distribution
@param file_path_distribution: File genome id associated with the abundance of a genome
@type file_path_distribution: str | unicode
@param file_path_genome_locations: File genome id associated with the file path of a genome
@type file_path_genome_locations: str | unicode
@param directory_output: Directory for the sam and fastq files output
@type directory_output: str | unicode
@param total_size: Size of sample in base pairs
@type total_size: int
@param profile: wgsim options: 'errorfree', 'standard'
@type profile: str | unicode
@param fragment_size_mean: Size of the fragment of which the ends are used as reads in base pairs
@type fragment_size_mean: int
@param fragment_size_standard_deviation: Standard deviation of the fragment size in base pairs.
@type fragment_size_standard_deviation: int
"""
assert isinstance(total_size, (float, int)), "Expected natural digit"
assert isinstance(fragment_size_mean, int), "Expected natural digit"
assert isinstance(fragment_size_standard_deviation, int), "Expected natural digit"
assert total_size > 0, "Total size needs to be a positive number"
assert fragment_size_mean > 0, "Mean fragments size needs to be a positive number"
assert fragment_size_standard_deviation > 0, "Fragment size standard deviation needs to be a positive number"
assert self.validate_dir(directory_output)
if profile is not None:
self._profile = profile

dict_id_abundance = self._read_distribution_file(file_path_distribution)
dict_id_file_path = self._read_genome_location_file(file_path_genome_locations)
locs = set(dict_id_abundance.keys()) - set(dict_id_file_path.keys())
assert set(dict_id_file_path.keys()).issuperset(dict_id_abundance.keys()), "Some ids do not have a genome location %s" % locs

self._fragment_size_mean = 7408 # nanosim does not use fragment size, this is for getting the correct number of reads
# this value has been calculated using the script tools/nanosim_profile/get_mean from the values in nanosim_profile
factor = total_size # nanosim needs number of reads as input not coverage

self._logger.debug("Multiplication factor: {}".format(factor))
self._simulate_reads(dict_id_abundance, dict_id_file_path, factor, directory_output)
self._sam_from_reads(directory_output, dict_id_file_path)

def _sam_from_reads(self, directory_output, dict_id_file_path):
files = os.listdir(directory_output)
id_to_cigar_map = {}
for f in files:
if f.endswith("_error_profile"): # these are the introduced errors by Nanosim
prefix = f.rsplit("_",2)[0] # get basename
id_to_cigar_map[prefix] = sam_from_reads.get_cigars_nanosim(os.path.join(directory_output,f))
os.remove(os.path.join(directory_output,f)) # error_profile files are huge (TODO temporary requirement is still high)
for f in files:
if f.endswith("_reads.fasta"):
prefix = f.rsplit(".",1)[0].rsplit("_",1)[0]
read_file = os.path.join(directory_output,f)
cigars = id_to_cigar_map[prefix]
reference_path = dict_id_file_path[prefix]
sam_from_reads.write_sam(read_file, cigars, reference_path, prefix)
sam_from_reads.convert_fasta(read_file)
os.remove(os.path.join(directory_output,f)) # do not store read file twice

def _get_sys_cmd(self, file_path_input, fold_coverage, file_path_output_prefix):
"""
Build system command to be run.
@param file_path_input: Path to genome fasta file
@type file_path_input: str | unicode
@param fold_coverage: coverage of a genome
@type fold_coverage: int | float
@param file_path_output_prefix: Output prefix used by art illumina
@type file_path_output_prefix: str | unicode
@return: System command to run art illumina
@rtype: str | unicode
"""
assert self.validate_file(file_path_input)
assert isinstance(fold_coverage, (int, float))
assert self.validate_dir(file_path_output_prefix, only_parent=True)

arguments = [
'genome',
'-n', str(fold_coverage), # rename this, because its not the fold_coverage for wgsim
'-rg', file_path_input,
'-o', file_path_output_prefix,
'-c', "tools/nanosim_profile/training",
'--seed', str(self._get_seed() % 2**32 - 1), # nanosim seed cannot be > 2**32 -1
'-dna_type linear'
]

if self._logfile:
arguments.append(">> '{}'".format(self._logfile))

cmd = "{exe} {args}".format(exe=self._file_path_executable, args=" ".join(arguments))
return cmd

class ReadSimulationNanosim(ReadSimulationWrapper):
"""
Simulate long reads(Oxford Nanopore) using nanosim
Expand Down Expand Up @@ -917,5 +1024,6 @@ def _get_sys_cmd(self, file_path_input, fold_coverage, file_path_output_prefix):
"art": ReadSimulationArt,
"wgsim": ReadSimulationWgsim,
"nanosim": ReadSimulationNanosim,
"nanosim3": ReadSimulationNanosim3,
"pbsim": ReadSimulationPBsim
}
6 changes: 3 additions & 3 deletions scripts/argumenthandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,16 +421,16 @@ def _check_read_simulation_values(self):
self._logger.error("No error profile is given, check manual for possible choices!")
self._valid_arguments = False

if self._fragments_size_mean_in_bp is None and self._read_simulator_type != 'nanosim':
if self._fragments_size_mean_in_bp is None and 'nanosim' not in self._read_simulator_type:
self._logger.error("'-fmean' For the simulation with 'art', 'pbsim' and 'wgsim' the fragment size mean is required!")
self._valid_arguments = False
elif not self._validator.validate_number(self._fragments_size_mean_in_bp, minimum=1, key='-fmean') and self._read_simulator_type != 'nanosim':
elif not self._validator.validate_number(self._fragments_size_mean_in_bp, minimum=1, key='-fmean') and 'nanosim' not in self._read_simulator_type:
self._valid_arguments = False

if self._fragment_size_standard_deviation_in_bp is None and self._read_simulator_type != 'nanosim':
self._logger.error("'-fsd' For the simulation with 'art', 'pbsim' and 'wgsim' a standard_deviation of the fragments size is required!")
self._valid_arguments = False
elif not self._validator.validate_number(self._fragment_size_standard_deviation_in_bp, minimum=1, key='-fsd') and self._read_simulator_type != 'nanosim':
elif not self._validator.validate_number(self._fragment_size_standard_deviation_in_bp, minimum=1, key='-fsd') and 'nanosim' not in self._read_simulator_type:
self._logger.error(
"'-fsd' The standard_deviation of the fragments size must be a positive number: '{}'".format(
self._fragment_size_standard_deviation_in_bp))
Expand Down
3 changes: 2 additions & 1 deletion scripts/create_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def get_sequencing_technology(read_simulator):
"art" : "Illumina HiSeq2500",
"wgsim" : "Illumina",
"pbsim" : "Pacific Biosciences",
"nanosim" : "Oxford Nanopore"
"nanosim" : "Oxford Nanopore",
"nanosim" : "Oxford Nanopore, R9 chemistry"
}
return dict_of_simulators[read_simulator]

Expand Down
2 changes: 1 addition & 1 deletion scripts/defaultvalues.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class DefaultValues(DefaultLogging):
# ############
# [read_simulator]
# ############
_valid_read_simulators = ['art','wgsim','pbsim','nanosim']
_valid_read_simulators = ['art','wgsim','pbsim','nanosim','nanosim3']
_sample_size_in_base_pairs = None

_read_simulator_type = None
Expand Down
Binary file added tools/nanosim_profile/training_aligned_reads.pkl
Binary file not shown.
Binary file added tools/nanosim_profile/training_aligned_region.pkl
Binary file not shown.
2 changes: 2 additions & 0 deletions tools/nanosim_profile/training_chimeric_info
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Mean segments for each aligned read: 1.0256029639767317
Shrinkage rate (beta): 0.7315392728540919
8 changes: 8 additions & 0 deletions tools/nanosim_profile/training_error_markov_model
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
succedent mis ins del
start 0.268584806345365 0.3059561844692852 0.42545900918534985
mis 0.3416622418046852 0.2712574592048127 0.3870802989905021
ins 0.35306821604429006 0.29597698921899357 0.3509547947367163
del 0.34934425546259606 0.2442284211332606 0.40642732340414334
mis0 0.0 0.48214285714285715 0.5178571428571429
ins0 0.9999978113643295 0.0 2.188635670422638e-06
del0 0.999999804689931 1.9531006902772747e-07 0.0
4 changes: 4 additions & 0 deletions tools/nanosim_profile/training_error_rate.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Mismatch rate: 0.041274394579310175
Insertion rate: 0.0315868993750412
Deletion rate: 0.04876426334524471
Total error rate: 0.1216255572995961
Loading

0 comments on commit 3dab265

Please sign in to comment.