Merge branch 'CW-4992' into 'dev'

Update spectre [CW-4992] Closes CW-4992 See merge request epi2melabs/workflows/wf-human-variation!343
epi2me-labs · Nov 13, 2024 · 882cfd6 · 882cfd6
2 parents 64db8c9 + ae58846
commit 882cfd6
Show file tree

Hide file tree

Showing 13 changed files with 81 additions and 81 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -87,6 +87,7 @@ docker-run:
                 "wf-human-sv-phase",
                 "wf-human-cnv-spectre",
                 "wf-human-cnv-spectre_args",
+                "wf-human-cnv-spectre_hg19",
                 "wf-human-phase_all",
                 "wf-human-phase_all_lp",
                 "downsample",
@@ -428,6 +429,13 @@ docker-run:
               NF_IGNORE_PROCESSES: "get_coverage,getVersions,getParams,failedQCReport,makeAlignmentReport,phase_gvcf"
               NF_WORKFLOW_OPTS: "--snp false --sv false --mod false --cnv --str false --bam ${CI_PROJECT_NAME}/data/spectre_demo/demo.bam --ref ${CI_PROJECT_NAME}/data/spectre_demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --bam_min_coverage 0.00001 --spectre_args '--min-cnv-len 100000' --override_basecaller_cfg [email protected]"
 
+        - if: $MATRIX_NAME == "wf-human-cnv-spectre_hg19"
+          variables:
+              NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && wget -q -O ${CI_PROJECT_NAME}/data/demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-spectre/spectre_demo_hg19.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/demo_data.tar.gz -C ${CI_PROJECT_NAME}/data/"
+              NF_PROCESS_FILES: "modules/local/wf-human-cnv.nf"
+              NF_IGNORE_PROCESSES: "get_coverage,getVersions,getParams,failedQCReport,makeAlignmentReport,phase_gvcf"
+              NF_WORKFLOW_OPTS: "--snp false --sv false --mod false --cnv --str false --bam ${CI_PROJECT_NAME}/data/spectre_demo_hg19/demo.bam --ref ${CI_PROJECT_NAME}/data/spectre_demo_hg19/human_g1k_v37.fasta.gz --bam_min_coverage 0.00001 --override_basecaller_cfg [email protected]"
+
         - if: $MATRIX_NAME == "gzref-bam-mod-hap"
           variables:
               NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && wget -q -O ${CI_PROJECT_NAME}/data/demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-somatic-variation/wf-somatic-variation-demo.tar.gz && \

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased]
 ### Changed
-- Reconciled workflow with wf-template v5.3.0
+- Reconciled workflow with wf-template v5.3.1
+- Update Spectre to v0.3.2, which includes support for calling CNVs in hg19 data.
 - ClinVar version in SnpEff container updated to version 20241103.
 ### Added
 - `--spectre_args` may be used to provide custom arguments to the Spectre process.

diff --git a/README.md b/README.md
@@ -141,7 +141,7 @@ input_reads.bam     ─── input_directory
 |--------------------------|------|-------------|------|---------|
 | sv | boolean | Call for structural variants. | If this option is selected, structural variant calling will be carried out using Sniffles2. | False |
 | snp | boolean | Call for small variants | If this option is selected, small variant calling will be carried out using Clair3. | False |
-| cnv | boolean | Call for copy number variants. | If this option is selected, copy number variant calling will be carried out with either Spectre (default) or QDNAseq. To use QDNAseq instead of Spectre, use the option `--use_qdnaseq`. Spectre is only compatible with genome build hg38, and if QDNAseq is used, it is only compatible with genome builds hg37 and hg38. | False |
+| cnv | boolean | Call for copy number variants. | If this option is selected, copy number variant calling will be carried out with either Spectre (default) or QDNAseq. To use QDNAseq instead of Spectre, use the option --use_qdnaseq. | False |
 | str | boolean | Enable Straglr to genotype STR expansions. | If this option is selected, genotyping of STR expansions will be carried out using Straglr. This sub-workflow is only compatible with genome build hg38. | False |
 | mod | boolean | Enable output of modified calls to a bedMethyl file [requires input BAM with Ml and Mm tags] | This option is automatically selected and aggregation of modified calls with be carried out using modkit if Ml and Mm tags are found. Disable this option to prevent output of a bedMethyl file. | False |
 
@@ -167,7 +167,7 @@ input_reads.bam     ─── input_directory
 
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
-| use_qdnaseq | boolean | Use QDNAseq for CNV calling. | Set this to true to use QDNASeq for CNV calling instead of Spectre. QDNAseq is better suited to shorter reads such as those generated from adaptive sampling experiments. | False |
+| use_qdnaseq | boolean | Use QDNAseq for CNV calling. | Set this to true to use QDNAseq for CNV calling instead of Spectre. QDNAseq is better suited to shorter reads such as those generated from adaptive sampling experiments. | False |
 | qdnaseq_bin_size | integer | Bin size for QDNAseq in kbp. | Pre-computed bin annotations are available for a range of bin sizes. Larger sizes reduce noise, however this may result in reduced sensitivity. | 500 |
 
 
@@ -193,7 +193,7 @@ input_reads.bam     ─── input_directory
 | GVCF | boolean | Enable to output a gVCF file in addition to the VCF outputs (experimental). | By default the the workflow outputs a VCF file containing only records where a variant has been detected. Enabling this option will output additionally a gVCF with records spanning all reference positions regardless of whether a variant was detected in the sample. | False |
 | downsample_coverage | boolean | Downsample the coverage to along the genome. | This options will trigger a downsampling of the read alignments to the target coverage specified by --downsample_coverage_target. Downsampling will make the workflow run faster but could lead to non-deterministic variant calls. | False |
 | downsample_coverage_target | number | Average coverage or reads to use for the analyses. | This options will set the target coverage for the downsampling stage, if downsampling has been enabled. | 60 |
-| output_xam_fmt | string | Desired file format of alignment files created by alignment and phasing. | This setting controls the file format of (1) alignment files created by aligning or re-aligning an input BAM and (2) alignment files with haplotag information created during phasing of an input BAM. If using QDNASeq for CNV calling, the setting will be ignored for alignment or realignment as QDNASeq requires BAM input. | cram |
+| output_xam_fmt | string | Desired file format of alignment files created by alignment and phasing. | This setting controls the file format of (1) alignment files created by aligning or re-aligning an input BAM and (2) alignment files with haplotag information created during phasing of an input BAM. If using QDNAseq for CNV calling, the setting will be ignored for alignment or realignment as QDNAseq requires BAM input. | cram |
 | modkit_args | string | The additional options for modkit. | This is an advanced option to allow running modkit with custom settings. The arguments specified in this option will fully override all options set by the workflow. To provide custom arguments to [modkit pileup](https://nanoporetech.github.io/modkit/advanced_usage.html#pileup) from command line proceed as follows: `--modkit_args="--preset traditional"` |  |
 | sniffles_args | string | Additional command line arguments to pass to the Sniffles2 process | The additional command line arguments will be passed directly to [Sniffles2](https://github.com/fritzsedlazeck/Sniffles/tree/v2.0.7); ensure to use the right commands for the version and from command line provide them as follows: `--sniffles_args="--non-germline"`. |  |
 | spectre_args | string | Additional command line arguments to pass to the Spectre process | The additional command line arguments will be passed directly to [Spectre](https://github.com/epi2me-labs/ont-spectre); from the command line provide them as follows: `--spectre_args="--min-cnv-len 80000"`. |  |
@@ -314,7 +314,7 @@ Haplotype-resolved aggregated counts of modified bases can be obtained with the
 
 ### 6a. Copy number variants (CNV) calling with Spectre
 
-CNV calling is performed using an ONT implementation of [Spectre](https://github.com/epi2me-labs/ont-spectre/) using the `--cnv` flag. Spectre is the default CNV caller in the workflow and is compatible with hg38/GRCh38. The output of this workflow is a VCF of CNV calls annotated with SnpEff. Advanced users may wish to override default parameters using `--spectre_args`, e.g. `--spectre_args "--min-cnv-len 100000"`, however we don't recommend setting this to less than N50 * 5 to avoid false positive calls.
+CNV calling is performed using an ONT implementation of [Spectre](https://github.com/epi2me-labs/ont-spectre/) using the `--cnv` flag. Spectre is the default CNV caller in the workflow and is compatible with genome builds hg19/GRCh37 or hg38/GRCh38. The output of this workflow is a VCF of CNV calls annotated with SnpEff. Advanced users may wish to override default parameters using `--spectre_args`, e.g. `--spectre_args "--min-cnv-len 100000"`. We don't recommend setting `--min-cnv-len` to less than N50 * 5 to avoid false positive calls.
 
 ### 6b. Copy number variants (CNV) calling with QDNASeq
 
@@ -356,7 +356,7 @@ Some of the sub-workflows in wf-human-variation are restricted to certain genome
 
 |    Genome    | `--snp`  | `--sv`  | `--mod` | `--cnv` | `--cnv --use_qdnaseq` | `--str` | `--annotation false` | `--include_all_ctgs` |
 |--------------|----------|---------|---------|---------|-----------------------|---------|----------------------|----------------------|
-| hg19/GRCh37  | &check;  | &check; | &check; |         |        &check;        |         |         \*           |                      |
+| hg19/GRCh37  | &check;  | &check; | &check; | &check; |        &check;        |         |         \*           |                      |
 | hg38/GRCh38  | &check;  | &check; | &check; | &check; |        &check;        | &check; |         \*           |                      |
 | Other human  | &check;  | &check; | &check; |         |                       |         |       &check;        |                      |
 | Non human    | &check;  | &check; | &check; |         |                       |         |       &check;        |       &check;        |

diff --git a/base.config b/base.config
@@ -7,7 +7,7 @@ params {
         e2l_mod_tag = "shaa7bf2b62946eeb7646b9b9d60b892edfc3b3a52c"
         cnv_tag = "sha428cb19e51370020ccf29ec2af4eead44c6a17c2"
         str_tag = "shadd2f2963fe39351d4e0d6fa3ca54e1064c6ec057"
-        spectre_tag = "sha49a9fe474da9860f84f08f17f137b47a010b1834"
+        spectre_tag = "sha42472d37a5a992c3ee27894a23dce5e2fff66d27"
         snpeff_tag = "shab01c188f11ca9ce53d186fe22111eeac52409523"
         common_sha = "shad28e55140f75a68f59bbecc74e880aeab16ab158"
     }

diff --git a/bin/get_genome.py b/bin/get_genome.py
@@ -97,27 +97,35 @@ def get_genome(sizes):
     return ""
 
 
-def check_genome(genome_build, str_flag, cnv, use_qdnaseq):
+def check_genome(genome_build, str_flag):
     """Determine if genome is suitable for this workflow."""
     bad_genome = False
-    extra_msg_context = ""
+    extra_msg_context = (
+        "#####################################################################\n"
+        "# INPUT DATA PROBLEM\n"
+        "The genome build detected in the BAM is not compatible with this\n"
+        "workflow as it does not appear to be hg19/GRCh37 or hg38/GRCh38.\n"
+        "If you are trying to run this workflow with non-human data, please\n"
+        "consult the 'Genome compatibility and running the workflow on\n"
+        "non-human genomes' section of the README.\n"
+        "####################################################################\n"
+    )
     if not genome_build:
         bad_genome = True
-    elif (str_flag or (cnv and not use_qdnaseq)) and genome_build != "hg38":
+    elif str_flag and genome_build != "hg38":
         bad_genome = True
         extra_msg_context = (
-            "########################################################################\n"
+            "#####################################################################\n"
             "# INPUT DATA PROBLEM\n"
             "The genome build detected in the BAM is not compatible with this\n"
             "workflow.\n"
-            f"Detected genome: {genome_build}, but genotyping STRs and calling\n"
-            "CNVs with Spectre can only be performed when aligned to build 38.\n"
-            "To perform STR or CNV calling, you need to run the workflow providing\n"
-            "the following reference genome:\n\n"
+            f"Detected genome: {genome_build}, but genotyping STRs can only be\n"
+            "performed when aligned to build 38.\n"
+            "To perform STR calling, you need to run the workflow providing the\n"
+            "following reference genome to the --ref parameter:\n\n"
             f"{HG38_URL}\n\n"
-            "Alternatively, disable STR and CNV calling by setting --str false\n"
-            "and --cnv false.\n"
-            "########################################################################\n"
+            "Alternatively, disable STR calling by setting --str false.\n"
+            "####################################################################\n"
         )
     return (bad_genome, extra_msg_context)
 
@@ -139,24 +147,12 @@ def main():
         default=False,
         help="STR flag"
     )
-    parser.add_argument(
-        '--cnv',  action='store_true',
-        default=False,
-        help="CNV flag"
-    )
-    parser.add_argument(
-        '--use_qdnaseq',  action='store_true',
-        default=False,
-        help="QDNAseq flag"
-    )
     args = parser.parse_args()
 
     all_sizes = chromosome_sizes(args.chr_counts)
 
     genome_build = get_genome(all_sizes)
-    bad_genome, extra_msg_context = check_genome(
-        genome_build, args.str_, args.cnv, args.use_qdnaseq
-    )
+    bad_genome, extra_msg_context = check_genome(genome_build, args.str_)
 
     # explode on bad genome
     if bad_genome:

diff --git a/bin/workflow_glue/report_cnv_spectre.py b/bin/workflow_glue/report_cnv_spectre.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 """Plot Spectre CNVs."""
 
-import json
-
 from dominate.tags import a, p
 from ezcharts.components.reports.labs import LabsReport
 from ezcharts.components.theme import LAB_head_resources
@@ -44,8 +42,8 @@ def argparser():
         help="Workflow version",
     )
     parser.add_argument(
-        "--karyotype_json",
-        help="JSON file containing karyotype"
+        "--karyotype",
+        help="Text file containing karyotype"
     )
     parser.add_argument(
         '-o', '--output', required=True, dest="output_report",
@@ -61,16 +59,12 @@ def make_report(params, versions, cnv_df, args):
         WORKFLOW_NAME, params, versions, args.workflow_version,
         head_resources=[*LAB_head_resources])
 
-    karyotype = generate_karyotype(args.karyotype_json)
-
-    with report.add_section(
-            'Introduction', 'Intro'):
-        spectre_url = "https://github.com/fritzsedlazeck/Spectre/tree/ont-dev"
-        p(
-            "This report contains CNVs detected using an ",
-            a("ONT fork of Spectre", href=spectre_url),
-            ", as part of the wf-human-variation workflow."
-        )
+    with open(args.karyotype) as karyotype_file:
+        line = karyotype_file.readline().rstrip()
+        if line:
+            karyotype = line
+        else:
+            karyotype = "Not applicable"
 
     if not cnv_df.empty:
         # get the totals
@@ -79,16 +73,27 @@ def make_report(params, versions, cnv_df, args):
         total_dups = type_counts.get('DUP', 0)
         total_dels = type_counts.get('DEL', 0)
 
-        with report.main_content:
+        spectre_url = "https://github.com/nanoporetech/ont-spectre"
+
+        with report.add_section('At a glance', 'Summary'):
+            p(
+                "This section displays a summary of the CNVs detected using an ",
+                a("ONT implementation of Spectre", href=spectre_url),
+                ", as part of the wf-human-variation workflow."
+            )
             Stats(
                 columns=4,
                 items=[
-                    (karyotype, 'Predicted karyotype'),
+                    (karyotype, 'Predicted karyotype*'),
                     (total_cnv, 'Total no. of CNVs'),
                     (total_dups, 'Total no. of duplications'),
                     (total_dels, 'Total no. of deletions')
                 ])
 
+            p("""*The predicted karyotype is "Not applicable" if chromosomes X and
+                Y are absent from the input data or have insufficient coverage.
+                """)
+
         with report.add_section(
                 'Spectre CNV calls', 'Spectre CNV calls'):
 
@@ -132,28 +137,16 @@ def make_report(params, versions, cnv_df, args):
     return report
 
 
-def generate_karyotype(json_path):
-    """Convert karyotype JSON to string."""
-    with open(json_path, 'r') as json_file:
-        data = json.load(json_file)
-
-    count_x = data.get("X", 0)
-    count_y = data.get("Y", 0)
-
-    # construct string
-    karyotype = "X" * count_x + "Y" * count_y
-
-    return karyotype
-
-
 def main(args):
     """Run the entry point."""
     # read CNV BED into df
     try:
         cnv_df = pd.read_csv(args.cnv_bed, delim_whitespace=True, header=None)
         cnv_df.columns = ['chr', 'start', 'end', 'type', 'size', 'call']
 
-        filtered_df = cnv_df[cnv_df['chr'].isin(CHROMOSOMES)]
+        # hg19 chromosomes are read in as integers, so convert to string to ensure
+        # filtering is successful
+        filtered_df = cnv_df[cnv_df['chr'].astype(str).isin(CHROMOSOMES)]
     except pd.errors.EmptyDataError:
         filtered_df = pd.DataFrame()