Added structure for latest Nanosim

25280841 · Jul 5, 2021 · 3dab265 · 3dab265
1 parent 836f559
commit 3dab265
Show file tree

Hide file tree

Showing 19 changed files with 1,135 additions and 6 deletions.
diff --git a/scripts/ReadSimulationWrapper/readsimulationwrapper.py b/scripts/ReadSimulationWrapper/readsimulationwrapper.py
@@ -264,7 +264,7 @@ def _simulate_reads(self, dict_id_abundance, dict_id_file_path, factor, director
             abundance = dict_id_abundance[genome_id]
             if abundance == 0:
                 continue
-            if self._label == "ReadSimulationWgsim" or self._label == "ReadSimulationNanosim":
+            if self._label == "ReadSimulationWgsim" or "ReadSimulationNanosim" in self._label:
                 # name "fold_coverage" is misleading for wgsim/nanosim, which use number of reads as input
                 fold_coverage = int(round(abundance * factor / self._fragment_size_mean))
             else:
@@ -421,6 +421,113 @@ def _get_sys_cmd(self, file_path_input, fold_coverage, file_path_output_prefix):
         cmd = "{exe} {args}".format(exe=self._file_path_executable, args=" ".join(arguments))
         return cmd
 
+class ReadSimulationNanosim3(ReadSimulationWrapper):
+    """
+    Simulate long reads(Oxford Nanopore) using nanosim
+    """
+
+    _label = "ReadSimulationNanosim"
+
+    def __init__(self, file_path_executable, directory_error_profiles, **kwargs):
+        super(ReadSimulationNanosim3, self).__init__(file_path_executable, **kwargs)
+        self._profile = 'standard'
+
+    def simulate(
+        self, file_path_distribution, file_path_genome_locations, directory_output,
+        total_size, profile, fragment_size_mean, fragment_size_standard_deviation):
+        """
+        Simulate reads based on a given sample distribution
+
+        @param file_path_distribution: File genome id associated with the abundance of a genome
+        @type file_path_distribution: str | unicode
+        @param file_path_genome_locations: File genome id associated with the file path of a genome
+        @type file_path_genome_locations: str | unicode
+        @param directory_output: Directory for the sam and fastq files output
+        @type directory_output: str | unicode
+        @param total_size: Size of sample in base pairs
+        @type total_size: int 
+        @param profile: wgsim options: 'errorfree', 'standard'
+        @type profile: str | unicode
+        @param fragment_size_mean: Size of the fragment of which the ends are used as reads in base pairs
+        @type fragment_size_mean: int 
+        @param fragment_size_standard_deviation: Standard deviation of the fragment size in base pairs.
+        @type fragment_size_standard_deviation: int 
+        """
+        assert isinstance(total_size, (float, int)), "Expected natural digit"
+        assert isinstance(fragment_size_mean, int), "Expected natural digit"
+        assert isinstance(fragment_size_standard_deviation, int), "Expected natural digit"
+        assert total_size > 0, "Total size needs to be a positive number"
+        assert fragment_size_mean > 0, "Mean fragments size needs to be a positive number"
+        assert fragment_size_standard_deviation > 0, "Fragment size standard deviation needs to be a positive number"
+        assert self.validate_dir(directory_output)
+        if profile is not None:
+            self._profile = profile
+
+        dict_id_abundance = self._read_distribution_file(file_path_distribution)
+        dict_id_file_path = self._read_genome_location_file(file_path_genome_locations)
+        locs = set(dict_id_abundance.keys()) - set(dict_id_file_path.keys())
+        assert set(dict_id_file_path.keys()).issuperset(dict_id_abundance.keys()), "Some ids do not have a genome location %s" % locs
+
+        self._fragment_size_mean = 7408 # nanosim does not use fragment size, this is for getting the correct number of reads
+        # this value has been calculated using the script tools/nanosim_profile/get_mean from the values in nanosim_profile
+        factor = total_size  # nanosim needs number of reads as input not coverage
+
+        self._logger.debug("Multiplication factor: {}".format(factor))
+        self._simulate_reads(dict_id_abundance, dict_id_file_path, factor, directory_output)
+        self._sam_from_reads(directory_output, dict_id_file_path)
+
+    def _sam_from_reads(self, directory_output, dict_id_file_path):
+        files = os.listdir(directory_output)
+        id_to_cigar_map = {}
+        for f in files:
+            if f.endswith("_error_profile"): # these are the introduced errors by Nanosim
+                prefix = f.rsplit("_",2)[0] # get basename
+                id_to_cigar_map[prefix] = sam_from_reads.get_cigars_nanosim(os.path.join(directory_output,f))
+                os.remove(os.path.join(directory_output,f)) # error_profile files are huge (TODO temporary requirement is still high)
+        for f in files:
+            if f.endswith("_reads.fasta"):
+                prefix = f.rsplit(".",1)[0].rsplit("_",1)[0]
+                read_file = os.path.join(directory_output,f)
+                cigars = id_to_cigar_map[prefix]
+                reference_path = dict_id_file_path[prefix]
+                sam_from_reads.write_sam(read_file, cigars, reference_path, prefix)
+                sam_from_reads.convert_fasta(read_file)
+                os.remove(os.path.join(directory_output,f)) # do not store read file twice
+
+    def _get_sys_cmd(self, file_path_input, fold_coverage, file_path_output_prefix):
+        """
+        Build system command to be run.
+
+        @param file_path_input: Path to genome fasta file
+        @type file_path_input: str | unicode
+        @param fold_coverage: coverage of a genome
+        @type fold_coverage: int  | float
+        @param file_path_output_prefix: Output prefix used by art illumina
+        @type file_path_output_prefix: str | unicode
+
+        @return: System command to run art illumina
+        @rtype: str | unicode
+        """
+        assert self.validate_file(file_path_input)
+        assert isinstance(fold_coverage, (int, float))
+        assert self.validate_dir(file_path_output_prefix, only_parent=True)
+
+        arguments = [
+            'genome',
+            '-n', str(fold_coverage),  # rename this, because its not the fold_coverage for wgsim
+            '-rg', file_path_input,
+            '-o', file_path_output_prefix,
+            '-c', "tools/nanosim_profile/training",
+            '--seed', str(self._get_seed() % 2**32 - 1), # nanosim seed cannot be > 2**32 -1
+            '-dna_type linear'
+            ]
+
+        if self._logfile:
+            arguments.append(">> '{}'".format(self._logfile))
+
+        cmd = "{exe} {args}".format(exe=self._file_path_executable, args=" ".join(arguments))
+        return cmd
+
 class ReadSimulationNanosim(ReadSimulationWrapper):
     """
     Simulate long reads(Oxford Nanopore) using nanosim
@@ -917,5 +1024,6 @@ def _get_sys_cmd(self, file_path_input, fold_coverage, file_path_output_prefix):
     "art": ReadSimulationArt,
     "wgsim": ReadSimulationWgsim,
     "nanosim": ReadSimulationNanosim,
+    "nanosim3": ReadSimulationNanosim3,
     "pbsim": ReadSimulationPBsim
 }
diff --git a/scripts/argumenthandler.py b/scripts/argumenthandler.py
@@ -421,16 +421,16 @@ def _check_read_simulation_values(self):
                 self._logger.error("No error profile is given, check manual for possible choices!")
                 self._valid_arguments = False
 
-            if self._fragments_size_mean_in_bp is None and self._read_simulator_type != 'nanosim':
+            if self._fragments_size_mean_in_bp is None and 'nanosim' not in self._read_simulator_type:
                 self._logger.error("'-fmean' For the simulation with 'art', 'pbsim' and 'wgsim' the fragment size mean is required!")
                 self._valid_arguments = False
-            elif not self._validator.validate_number(self._fragments_size_mean_in_bp, minimum=1, key='-fmean') and self._read_simulator_type != 'nanosim':
+            elif not self._validator.validate_number(self._fragments_size_mean_in_bp, minimum=1, key='-fmean') and 'nanosim' not in self._read_simulator_type:
                 self._valid_arguments = False
 
             if self._fragment_size_standard_deviation_in_bp is None and self._read_simulator_type != 'nanosim':
                 self._logger.error("'-fsd' For the simulation with 'art', 'pbsim' and 'wgsim' a standard_deviation of the fragments size is required!")
                 self._valid_arguments = False
-            elif not self._validator.validate_number(self._fragment_size_standard_deviation_in_bp, minimum=1, key='-fsd') and self._read_simulator_type != 'nanosim':
+            elif not self._validator.validate_number(self._fragment_size_standard_deviation_in_bp, minimum=1, key='-fsd') and 'nanosim' not in self._read_simulator_type:
                 self._logger.error(
                     "'-fsd' The standard_deviation of the fragments size must be a positive number: '{}'".format(
                         self._fragment_size_standard_deviation_in_bp))

diff --git a/scripts/create_metadata.py b/scripts/create_metadata.py
@@ -74,7 +74,8 @@ def get_sequencing_technology(read_simulator):
         "art" : "Illumina HiSeq2500",
         "wgsim" : "Illumina",
         "pbsim" : "Pacific Biosciences",
-        "nanosim" : "Oxford Nanopore"
+        "nanosim" : "Oxford Nanopore",
+        "nanosim" : "Oxford Nanopore, R9 chemistry"
     }
     return dict_of_simulators[read_simulator]
 

diff --git a/scripts/defaultvalues.py b/scripts/defaultvalues.py
@@ -54,7 +54,7 @@ class DefaultValues(DefaultLogging):
     # ############
     # [read_simulator]
     # ############
-    _valid_read_simulators = ['art','wgsim','pbsim','nanosim']
+    _valid_read_simulators = ['art','wgsim','pbsim','nanosim','nanosim3']
     _sample_size_in_base_pairs = None
 
     _read_simulator_type = None

diff --git a/tools/nanosim_profile/training_aligned_reads.pkl b/tools/nanosim_profile/training_aligned_reads.pkl
diff --git a/tools/nanosim_profile/training_aligned_region.pkl b/tools/nanosim_profile/training_aligned_region.pkl
diff --git a/tools/nanosim_profile/training_chimeric_info b/tools/nanosim_profile/training_chimeric_info
@@ -0,0 +1,2 @@
+Mean segments for each aligned read:	1.0256029639767317
+Shrinkage rate (beta):	0.7315392728540919
diff --git a/tools/nanosim_profile/training_error_markov_model b/tools/nanosim_profile/training_error_markov_model
@@ -0,0 +1,8 @@
+succedent 	mis	ins	del
+start	0.268584806345365	0.3059561844692852	0.42545900918534985
+mis	0.3416622418046852	0.2712574592048127	0.3870802989905021
+ins	0.35306821604429006	0.29597698921899357	0.3509547947367163
+del	0.34934425546259606	0.2442284211332606	0.40642732340414334
+mis0	0.0	0.48214285714285715	0.5178571428571429
+ins0	0.9999978113643295	0.0	2.188635670422638e-06
+del0	0.999999804689931	1.9531006902772747e-07	0.0
diff --git a/tools/nanosim_profile/training_error_rate.tsv b/tools/nanosim_profile/training_error_rate.tsv
@@ -0,0 +1,4 @@
+Mismatch rate:	0.041274394579310175
+Insertion rate:	0.0315868993750412
+Deletion rate:	0.04876426334524471
+Total error rate:	0.1216255572995961
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Mean segments for each aligned read: 1.0256029639767317
		Shrinkage rate (beta): 0.7315392728540919