From f66241476910512318fa35081885f059e1a10fe5 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 8 Dec 2020 15:07:52 -0500 Subject: [PATCH 01/52] Allow passing --libtype A for #3 --- pisces/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/cli.py b/pisces/cli.py index 84e4cb7..3615730 100755 --- a/pisces/cli.py +++ b/pisces/cli.py @@ -209,7 +209,7 @@ def create_parser(args=None): '-l', '--libtype', type=str, - choices=('IU', 'ISF', 'ISR'), + choices=('IU', 'ISF', 'ISR', 'A'), help= "library geometry for Salmon (http://salmon.readthedocs.org/en/latest/salmon.html#what-s-this-libtype) default=auto" ) From 618305261b80ed82adc6b7be21662af7476ffd71 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 8 Dec 2020 15:11:15 -0500 Subject: [PATCH 02/52] Replace version number hardcoding --- pisces/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index aea1989..bca7d96 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -16,8 +16,13 @@ from multiprocessing import Process from tempfile import NamedTemporaryFile, mkdtemp from functools import partial +from pkg_resources import get_distribution, DistributionNotFound -__version__ = "0.1.1" +try: + __version__ = get_distribution(__name__).version +except DistributionNotFound: + # package is not installed + pass unique_id = ''.join(random.choice(string.digits) for _ in range(10)) From 51bc4dda70f3a5b813ef7efd7775348af96b5efe Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 8 Dec 2020 15:32:52 -0500 Subject: [PATCH 03/52] Don't expect version number to fail --- pisces/__init__.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index bca7d96..c00ed59 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -16,13 +16,9 @@ from multiprocessing import Process from tempfile import NamedTemporaryFile, mkdtemp from functools import partial -from pkg_resources import get_distribution, DistributionNotFound +from pkg_resources import get_distribution -try: - __version__ = get_distribution(__name__).version -except DistributionNotFound: - # package is not installed - pass +__version__ = get_distribution(__name__).version unique_id = ''.join(random.choice(string.digits) for _ in range(10)) From 9a1c64873a964ea6ac5c4e1336c096c8d67d9d8d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 14 Dec 2020 13:10:58 -0500 Subject: [PATCH 04/52] Automatic libtype setting, allow fastqp to detect file type. --- pisces/__init__.py | 1 - pisces/run.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index c00ed59..00e2e8a 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -78,7 +78,6 @@ def read_level_qc(fastq, output_dir, sample_name): 'nreads': 2000000, 'base_probs': '0.25,0.25,0.25,0.25,0.1', 'kmer': 5, - 'type': 'fastq', 'leftlimit': 1, 'rightlimit': -1, 'median-qual': 30, diff --git a/pisces/run.py b/pisces/run.py index 3cd9369..a187c11 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -112,6 +112,8 @@ def run(args, unknown_args): except RuntimeError: if not args.sample_type or not args.libtype: logging.error("Genotyping failed: please specify --sample-type and --libtype parameters manually.") + logging.error("Running salmon with --libtype A") + libtype = 'A' if args.libtype: libtype = args.libtype From 4169ace918d4ccd7f7e337a3ce0f671e6c91a509 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 14 Dec 2020 13:16:37 -0500 Subject: [PATCH 05/52] Get version number from installed package --- pisces/cli.py | 4 +++- pisces/index.py | 4 +++- pisces/qc.py | 4 +++- pisces/run.py | 4 +++- pisces/submit.py | 3 ++- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pisces/cli.py b/pisces/cli.py index 3615730..5db3e61 100755 --- a/pisces/cli.py +++ b/pisces/cli.py @@ -20,7 +20,9 @@ from subprocess import Popen, PIPE, call from collections import OrderedDict, defaultdict from tqdm import tqdm -from pisces import __version__, find_data_directory +from pisces import find_data_directory + +__version__ = get_distribution("pisces").version data_dir = find_data_directory() diff --git a/pisces/index.py b/pisces/index.py index 4082586..317cb42 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -1,6 +1,8 @@ import logging import os -from pisces import __version__, find_data_directory +from pisces import find_data_directory + +__version__ = get_distribution("pisces").version def build_index(args, unknown_args): from pyfaidx import Fasta diff --git a/pisces/qc.py b/pisces/qc.py index 9c6fceb..278d308 100644 --- a/pisces/qc.py +++ b/pisces/qc.py @@ -1,8 +1,10 @@ from pisces import __version__ from collections import OrderedDict, defaultdict -from pisces import __version__, find_data_directory +from pisces import find_data_directory from subprocess import Popen, PIPE, call +__version__ = get_distribution("pisces").version + data_dir = find_data_directory() class DefaultOrderedDict(OrderedDict): diff --git a/pisces/run.py b/pisces/run.py index a187c11..b7dc77c 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -4,9 +4,11 @@ import time import socket from multiprocessing import Process -from pisces import __version__, find_data_directory, long_substr +from pisces import find_data_directory, long_substr from itertools import chain +__version__ = get_distribution("pisces").version + data_dir = find_data_directory() def run(args, unknown_args): diff --git a/pisces/submit.py b/pisces/submit.py index 078d01b..a2565b6 100644 --- a/pisces/submit.py +++ b/pisces/submit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from pisces import __version__ from signal import signal, SIGINT, SIGTERM from subprocess import Popen, PIPE, call from tqdm import tqdm @@ -11,6 +10,8 @@ import pandas as pd import time +__version__ = get_distribution("pisces").version + def _submit_drmaa(args, unknown_args): """ Submit multiple 'pisces run' jobs to the cluster using libdrmaa """ From 133a627015aeb00b798adafc0dfc9c740d75c541 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 14 Dec 2020 13:21:20 -0500 Subject: [PATCH 06/52] use proper package name --- .gitignore | 1 + pisces/__init__.py | 2 +- pisces/cli.py | 2 +- pisces/index.py | 2 +- pisces/qc.py | 2 +- pisces/run.py | 2 +- pisces/submit.py | 2 +- 7 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e99e36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc \ No newline at end of file diff --git a/pisces/__init__.py b/pisces/__init__.py index 00e2e8a..680d54c 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -18,7 +18,7 @@ from functools import partial from pkg_resources import get_distribution -__version__ = get_distribution(__name__).version +__version__ = get_distribution("novartis_pisces").version unique_id = ''.join(random.choice(string.digits) for _ in range(10)) diff --git a/pisces/cli.py b/pisces/cli.py index 5db3e61..79cb812 100755 --- a/pisces/cli.py +++ b/pisces/cli.py @@ -22,7 +22,7 @@ from tqdm import tqdm from pisces import find_data_directory -__version__ = get_distribution("pisces").version +__version__ = get_distribution("novartis_pisces").version data_dir = find_data_directory() diff --git a/pisces/index.py b/pisces/index.py index 317cb42..ff878ad 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -2,7 +2,7 @@ import os from pisces import find_data_directory -__version__ = get_distribution("pisces").version +__version__ = get_distribution("novartis_pisces").version def build_index(args, unknown_args): from pyfaidx import Fasta diff --git a/pisces/qc.py b/pisces/qc.py index 278d308..ea16a4d 100644 --- a/pisces/qc.py +++ b/pisces/qc.py @@ -3,7 +3,7 @@ from pisces import find_data_directory from subprocess import Popen, PIPE, call -__version__ = get_distribution("pisces").version +__version__ = get_distribution("novartis_pisces").version data_dir = find_data_directory() diff --git a/pisces/run.py b/pisces/run.py index b7dc77c..8a3aff4 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -7,7 +7,7 @@ from pisces import find_data_directory, long_substr from itertools import chain -__version__ = get_distribution("pisces").version +__version__ = get_distribution("novartis_pisces").version data_dir = find_data_directory() diff --git a/pisces/submit.py b/pisces/submit.py index a2565b6..90cf975 100644 --- a/pisces/submit.py +++ b/pisces/submit.py @@ -10,7 +10,7 @@ import pandas as pd import time -__version__ = get_distribution("pisces").version +__version__ = get_distribution("novartis_pisces").version def _submit_drmaa(args, unknown_args): """ Submit multiple 'pisces run' jobs to the cluster using libdrmaa """ From 26d8e861d6bf23fdf0bc689bfb44f84c3d302c58 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 14 Dec 2020 13:26:10 -0500 Subject: [PATCH 07/52] import from pkg_resources --- pisces/cli.py | 1 + pisces/fastq.py | 1 + pisces/index.py | 1 + pisces/qc.py | 1 + pisces/run.py | 1 + pisces/submit.py | 1 + 6 files changed, 6 insertions(+) diff --git a/pisces/cli.py b/pisces/cli.py index 79cb812..3dabd81 100755 --- a/pisces/cli.py +++ b/pisces/cli.py @@ -21,6 +21,7 @@ from collections import OrderedDict, defaultdict from tqdm import tqdm from pisces import find_data_directory +from pkg_resources import get_distribution __version__ = get_distribution("novartis_pisces").version diff --git a/pisces/fastq.py b/pisces/fastq.py index 83ba6d3..09001eb 100755 --- a/pisces/fastq.py +++ b/pisces/fastq.py @@ -12,6 +12,7 @@ from subprocess import Popen, PIPE from io import TextIOWrapper import os.path +from pkg_resources import get_distribution if PY2: import string diff --git a/pisces/index.py b/pisces/index.py index ff878ad..f1f3378 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -1,6 +1,7 @@ import logging import os from pisces import find_data_directory +from pkg_resources import get_distribution __version__ = get_distribution("novartis_pisces").version diff --git a/pisces/qc.py b/pisces/qc.py index ea16a4d..8907ce4 100644 --- a/pisces/qc.py +++ b/pisces/qc.py @@ -2,6 +2,7 @@ from collections import OrderedDict, defaultdict from pisces import find_data_directory from subprocess import Popen, PIPE, call +from pkg_resources import get_distribution __version__ = get_distribution("novartis_pisces").version diff --git a/pisces/run.py b/pisces/run.py index 8a3aff4..2adc886 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -6,6 +6,7 @@ from multiprocessing import Process from pisces import find_data_directory, long_substr from itertools import chain +from pkg_resources import get_distribution __version__ = get_distribution("novartis_pisces").version diff --git a/pisces/submit.py b/pisces/submit.py index 90cf975..266a1aa 100644 --- a/pisces/submit.py +++ b/pisces/submit.py @@ -9,6 +9,7 @@ import pickle import pandas as pd import time +from pkg_resources import get_distribution __version__ = get_distribution("novartis_pisces").version From e765a0c862209f549ad62c3936141d89322b2cad Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 14 Dec 2020 15:45:17 -0500 Subject: [PATCH 08/52] Pass type:None for fastqp --- pisces/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pisces/__init__.py b/pisces/__init__.py index 680d54c..6d7c300 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -78,6 +78,7 @@ def read_level_qc(fastq, output_dir, sample_name): 'nreads': 2000000, 'base_probs': '0.25,0.25,0.25,0.25,0.1', 'kmer': 5, + 'type': None, 'leftlimit': 1, 'rightlimit': -1, 'median-qual': 30, From 2aee0bdfd26067ae956a5b9fca2cd2cb8cb1999a Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 18 Dec 2020 10:36:15 -0500 Subject: [PATCH 09/52] adding renv.lock file --- pisces/R/renv.lock | 249 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 pisces/R/renv.lock diff --git a/pisces/R/renv.lock b/pisces/R/renv.lock new file mode 100644 index 0000000..d405715 --- /dev/null +++ b/pisces/R/renv.lock @@ -0,0 +1,249 @@ +{ + "R": { + "Version": "4.0.3", + "Repositories": [ + { + "Name": "BioCsoft", + "URL": "https://bioconductor.org/packages/3.12/bioc" + }, + { + "Name": "BioCann", + "URL": "https://bioconductor.org/packages/3.12/data/annotation" + }, + { + "Name": "BioCexp", + "URL": "https://bioconductor.org/packages/3.12/data/experiment" + }, + { + "Name": "BioCworkflows", + "URL": "https://bioconductor.org/packages/3.12/workflows" + }, + { + "Name": "CRAN", + "URL": "https://cloud.r-project.org" + } + ] + }, + "Bioconductor": { + "Version": "3.12" + }, + "Packages": { + "BH": { + "Package": "BH", + "Version": "1.72.0-3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8f9ce74c6417d61f0782cbae5fd2b7b0" + }, + "BiocManager": { + "Package": "BiocManager", + "Version": "1.30.10", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "db75371846625725e221470b310da1d5" + }, + "R6": { + "Package": "R6", + "Version": "2.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "292b54f8f4b94669b08f94e5acce6be2" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "125dc7a0ed375eb68c0ce533b48d291f" + }, + "assertthat": { + "Package": "assertthat", + "Version": "0.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "50c838a310445e954bc13f26f26a6ecf" + }, + "cli": { + "Package": "cli", + "Version": "2.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "ff0becff7bfdfe3f75d29aff8f3172dd" + }, + "clipr": { + "Package": "clipr", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "08cf4045c149a0f0eaf405324c7495bd" + }, + "crayon": { + "Package": "crayon", + "Version": "1.3.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0d57bc8e27b7ba9e45dba825ebc0de6b" + }, + "digest": { + "Package": "digest", + "Version": "0.6.25", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f697db7d92b7028c4b3436e9603fb636" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4011f62581a34080e44105d4aa05a97f" + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fd2844b3a43ae2d27e70ece2df1b4e2a" + }, + "fansi": { + "Package": "fansi", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7fce217eaaf8016e72065e85c73027b5" + }, + "generics": { + "Package": "generics", + "Version": "0.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8cff1d1391fd1ad8b65877f4c7f2e53" + }, + "glue": { + "Package": "glue", + "Version": "1.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "6efd734b14c6471cfe443345f3e35e29" + }, + "hms": { + "Package": "hms", + "Version": "0.5.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "726671f634529d470545f9fd1a9d1869" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.7.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1ec84e070b88b37ed169f19def40d47c" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "361811f31f71f8a617a9a68bf63f1f42" + }, + "magrittr": { + "Package": "magrittr", + "Version": "1.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1bb58822a20301cee84a41678e25d9b7" + }, + "pillar": { + "Package": "pillar", + "Version": "1.4.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b01d2494d0e2b6b02fae40e1543fbcb0" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "purrr": { + "Package": "purrr", + "Version": "0.3.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "97def703420c8ab10d8f0e6c72101e02" + }, + "readr": { + "Package": "readr", + "Version": "1.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "af8ab99cd936773a148963905736907b" + }, + "renv": { + "Package": "renv", + "Version": "0.12.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b5510c50c7f31d453c385c7b460af2b9" + }, + "rlang": { + "Package": "rlang", + "Version": "0.4.8", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "843a6af51414bce7f8a8e372f11d6cd0" + }, + "stringi": { + "Package": "stringi", + "Version": "1.5.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a063ebea753c92910a4cca7b18bc1f05" + }, + "stringr": { + "Package": "stringr", + "Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0759e6b6c0957edb1311028a49a35e76" + }, + "tibble": { + "Package": "tibble", + "Version": "3.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1c61e4cad000e03b1bd687db16a75926" + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8187ca2c27e80978f5b143bec554a437" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "6ea435c354e8448819627cf686f66e0a" + }, + "utf8": { + "Package": "utf8", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4a5081acfb7b81a572e4384a7aaf2af1" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1739235995f08583db4095a28c357207" + } + } +} From 0f84c848790f6caa67c05183d9ec74db482f2863 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 18 Dec 2020 10:43:39 -0500 Subject: [PATCH 10/52] Use lockfile for dependency management. --- pisces/R/set_up_dependencies.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pisces/R/set_up_dependencies.R b/pisces/R/set_up_dependencies.R index e5c899a..abacf64 100755 --- a/pisces/R/set_up_dependencies.R +++ b/pisces/R/set_up_dependencies.R @@ -12,7 +12,4 @@ script.dir <- getScriptPath() setwd(script.dir) renv::settings$use.cache(FALSE, persist = TRUE) renv::consent(provided = TRUE) -renv::install("BiocManager") -options(repos = BiocManager::repositories()) -renv::hydrate(packages=renv::dependencies()$Package) -renv::snapshot() \ No newline at end of file +renv::restore() \ No newline at end of file From 9cc2974b1228d06f614da4284c6ed88f285b7a03 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 18 Dec 2020 13:31:23 -0500 Subject: [PATCH 11/52] Revert "Use lockfile for dependency management." This reverts commit 0f84c848790f6caa67c05183d9ec74db482f2863. --- pisces/R/set_up_dependencies.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pisces/R/set_up_dependencies.R b/pisces/R/set_up_dependencies.R index abacf64..e5c899a 100755 --- a/pisces/R/set_up_dependencies.R +++ b/pisces/R/set_up_dependencies.R @@ -12,4 +12,7 @@ script.dir <- getScriptPath() setwd(script.dir) renv::settings$use.cache(FALSE, persist = TRUE) renv::consent(provided = TRUE) -renv::restore() \ No newline at end of file +renv::install("BiocManager") +options(repos = BiocManager::repositories()) +renv::hydrate(packages=renv::dependencies()$Package) +renv::snapshot() \ No newline at end of file From d62fdf6cee64aba7000e146d1d2e95d40eaac1e6 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 18 Dec 2020 13:31:40 -0500 Subject: [PATCH 12/52] Revert "adding renv.lock file" This reverts commit 2aee0bdfd26067ae956a5b9fca2cd2cb8cb1999a. --- pisces/R/renv.lock | 249 --------------------------------------------- 1 file changed, 249 deletions(-) delete mode 100644 pisces/R/renv.lock diff --git a/pisces/R/renv.lock b/pisces/R/renv.lock deleted file mode 100644 index d405715..0000000 --- a/pisces/R/renv.lock +++ /dev/null @@ -1,249 +0,0 @@ -{ - "R": { - "Version": "4.0.3", - "Repositories": [ - { - "Name": "BioCsoft", - "URL": "https://bioconductor.org/packages/3.12/bioc" - }, - { - "Name": "BioCann", - "URL": "https://bioconductor.org/packages/3.12/data/annotation" - }, - { - "Name": "BioCexp", - "URL": "https://bioconductor.org/packages/3.12/data/experiment" - }, - { - "Name": "BioCworkflows", - "URL": "https://bioconductor.org/packages/3.12/workflows" - }, - { - "Name": "CRAN", - "URL": "https://cloud.r-project.org" - } - ] - }, - "Bioconductor": { - "Version": "3.12" - }, - "Packages": { - "BH": { - "Package": "BH", - "Version": "1.72.0-3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "8f9ce74c6417d61f0782cbae5fd2b7b0" - }, - "BiocManager": { - "Package": "BiocManager", - "Version": "1.30.10", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "db75371846625725e221470b310da1d5" - }, - "R6": { - "Package": "R6", - "Version": "2.4.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "292b54f8f4b94669b08f94e5acce6be2" - }, - "Rcpp": { - "Package": "Rcpp", - "Version": "1.0.5", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "125dc7a0ed375eb68c0ce533b48d291f" - }, - "assertthat": { - "Package": "assertthat", - "Version": "0.2.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "50c838a310445e954bc13f26f26a6ecf" - }, - "cli": { - "Package": "cli", - "Version": "2.0.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "ff0becff7bfdfe3f75d29aff8f3172dd" - }, - "clipr": { - "Package": "clipr", - "Version": "0.7.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "08cf4045c149a0f0eaf405324c7495bd" - }, - "crayon": { - "Package": "crayon", - "Version": "1.3.4", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "0d57bc8e27b7ba9e45dba825ebc0de6b" - }, - "digest": { - "Package": "digest", - "Version": "0.6.25", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "f697db7d92b7028c4b3436e9603fb636" - }, - "dplyr": { - "Package": "dplyr", - "Version": "1.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "4011f62581a34080e44105d4aa05a97f" - }, - "ellipsis": { - "Package": "ellipsis", - "Version": "0.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "fd2844b3a43ae2d27e70ece2df1b4e2a" - }, - "fansi": { - "Package": "fansi", - "Version": "0.4.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "7fce217eaaf8016e72065e85c73027b5" - }, - "generics": { - "Package": "generics", - "Version": "0.0.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "b8cff1d1391fd1ad8b65877f4c7f2e53" - }, - "glue": { - "Package": "glue", - "Version": "1.4.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "6efd734b14c6471cfe443345f3e35e29" - }, - "hms": { - "Package": "hms", - "Version": "0.5.3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "726671f634529d470545f9fd1a9d1869" - }, - "jsonlite": { - "Package": "jsonlite", - "Version": "1.7.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "1ec84e070b88b37ed169f19def40d47c" - }, - "lifecycle": { - "Package": "lifecycle", - "Version": "0.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "361811f31f71f8a617a9a68bf63f1f42" - }, - "magrittr": { - "Package": "magrittr", - "Version": "1.5", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "1bb58822a20301cee84a41678e25d9b7" - }, - "pillar": { - "Package": "pillar", - "Version": "1.4.4", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "b01d2494d0e2b6b02fae40e1543fbcb0" - }, - "pkgconfig": { - "Package": "pkgconfig", - "Version": "2.0.3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "01f28d4278f15c76cddbea05899c5d6f" - }, - "purrr": { - "Package": "purrr", - "Version": "0.3.4", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "97def703420c8ab10d8f0e6c72101e02" - }, - "readr": { - "Package": "readr", - "Version": "1.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "af8ab99cd936773a148963905736907b" - }, - "renv": { - "Package": "renv", - "Version": "0.12.3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "b5510c50c7f31d453c385c7b460af2b9" - }, - "rlang": { - "Package": "rlang", - "Version": "0.4.8", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "843a6af51414bce7f8a8e372f11d6cd0" - }, - "stringi": { - "Package": "stringi", - "Version": "1.5.3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "a063ebea753c92910a4cca7b18bc1f05" - }, - "stringr": { - "Package": "stringr", - "Version": "1.4.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "0759e6b6c0957edb1311028a49a35e76" - }, - "tibble": { - "Package": "tibble", - "Version": "3.0.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "1c61e4cad000e03b1bd687db16a75926" - }, - "tidyr": { - "Package": "tidyr", - "Version": "1.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "8187ca2c27e80978f5b143bec554a437" - }, - "tidyselect": { - "Package": "tidyselect", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "6ea435c354e8448819627cf686f66e0a" - }, - "utf8": { - "Package": "utf8", - "Version": "1.1.4", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "4a5081acfb7b81a572e4384a7aaf2af1" - }, - "vctrs": { - "Package": "vctrs", - "Version": "0.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "1739235995f08583db4095a28c357207" - } - } -} From 9f7ba0ee8d0a3b4faf41f0b3de0389d050e01a49 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 13 Jan 2021 14:30:40 -0500 Subject: [PATCH 13/52] Write out effective length matrices for transcripts and genes (#6) --- pisces/R/make_expression_matrix.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index 77c03b4..369b302 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -172,6 +172,13 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { as.data.frame() %>% format(nsmall = 3, scientific = F, trim = T) %>% write.table(paste0(args[["--name"]], ".", species, ".isoforms.counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + + as.data.frame(txi$length[tx.idx,]) %>% + tibble::rownames_to_column("transcript_id") %>% + left_join(tx2gene) %>% + as.data.frame() %>% + format(nsmall = 3, scientific = F, trim = T) %>% + write.table(paste0(args[["--name"]], ".", species, ".isoforms.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) message("Writing intron level tables...") as.data.frame(txi$abundance[intron.idx,]) %>% @@ -187,6 +194,12 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { format(nsmall = 3, scientific = F, trim = T) %>% write.table(paste0(args[["--name"]], ".", species, ".introns.counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + as.data.frame(txi$length[intron.idx,]) %>% + tibble::rownames_to_column("gene_id") %>% + as.data.frame() %>% + format(nsmall = 3, scientific = F, trim = T) %>% + write.table(paste0(args[["--name"]], ".", species, ".introns.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + message("Writing intergene level tables...") as.data.frame(txi$abundance[intergene.idx,]) %>% rescaleTPM() %>% @@ -201,6 +214,12 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { format(nsmall = 3, scientific = F, trim = T) %>% write.table(paste0(args[["--name"]], ".", species, ".intergenes.counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + as.data.frame(txi$length[intergene.idx,]) %>% + tibble::rownames_to_column("gene_id") %>% + as.data.frame() %>% + format(nsmall = 3, scientific = F, trim = T) %>% + write.table(paste0(args[["--name"]], ".", species, ".intergenes.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + message("Building transcript QC measure table...") pct_intronic <- colSums(txi$counts[intron.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100 pct_intergenic <- colSums(txi$counts[intergene.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100 @@ -309,6 +328,12 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { tibble::rownames_to_column("gene_id") %>% inner_join(annotation) %>% write.table(paste0(args[["--name"]], ".", species, ".counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + + message("Writing effective length matrix...") + as.data.frame(txi.gene$length) %>% + tibble::rownames_to_column("gene_id") %>% + inner_join(annotation) %>% + write.table(paste0(args[["--name"]], ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) if (is.character(args[["--deseq-formula"]])) { message(paste("DEseq formula:", args[["--deseq-formula"]])) From 2ced2a2575039e16b1374d6abd4c06543a7ce516 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 8 Feb 2021 21:20:45 -0500 Subject: [PATCH 14/52] Allow extra_fastas remote URIs --- pisces/index.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index f1f3378..62184b6 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -61,12 +61,55 @@ def build_index(args, unknown_args): transcripts_fasta_file = os.path.join( index_dir_path, "transcripts", "transcripts.fa") + with open(transcripts_fasta_file, 'w') as transcripts_fasta: - for fasta in dataset["extra_fastas"]: - with open(fasta) as extra: + ## all of this URI handling should probably use an existing library like + ## https://github.com/intake/filesystem_spec + for fasta_loc in dataset["extra_fastas"]: + fasta = urlparse(fasta_loc) + if fasta.scheme == '': + reference = Fasta(fasta.path) + elif fasta.scheme.lower() in ('ftp', 'http', 'https'): + _fasta_local_path = os.path.join( + download_dir, os.path.basename(fasta.path)) + logging.info("Downloading %s", fasta.geturl()) + if not os.path.exists(_fasta_local_path): + with urlopen(fasta.geturl()) as _fasta: + with open(_fasta_local_path, + 'wb') as _fasta_local: + copyfileobj(_fasta, _fasta_local) + _fasta_local.flush() + if fasta.path.endswith('gz'): + logging.info("Decompressing %s", + fasta.geturl()) + call( + ' '.join([ + 'gzip -dc', _fasta_local_path, + '>', + _fasta_local_path.replace( + ".gz", "") + ]), + shell=True) + if _fasta_local_path.endswith("2bit"): + logging.info("Converting %s to FASTA format", + fasta.geturl()) + twobit = TwoBitFile(_fasta_local_path) + if not os.path.exists( + _fasta_local_path.replace("2bit", "fa")): + with open( + _fasta_local_path.replace( + "2bit", "fa"), 'w') as fasta: + for chrom in twobit.keys(): + fasta.write(">%s\n" % chrom) + fasta.write(str(twobit[chrom]) + '\n') + reference = Fasta( + _fasta_local_path.replace("2bit", "fa")) + + with open(_fasta_local_path) as extra: logging.info("Adding entries from %s", fasta) for line in extra: transcripts_fasta.write(line) + for gtf_loc, fasta_loc in zip(dataset["gtfs"], dataset["fastas"]): gtf = urlparse(gtf_loc) From 4665240b09224274fe6ad78aaa019bd0bf2a3e6e Mon Sep 17 00:00:00 2001 From: David Barrat Date: Tue, 16 Feb 2021 23:30:30 +0100 Subject: [PATCH 15/52] remove .DS_Store files --- .gitignore | 3 ++- docsource/.DS_Store | Bin 10244 -> 0 bytes docsource/figures/.DS_Store | Bin 6148 -> 0 bytes pisces/.DS_Store | Bin 10244 -> 0 bytes pisces/R/.DS_Store | Bin 6148 -> 0 bytes pisces/scripts/.DS_Store | Bin 6148 -> 0 bytes 6 files changed, 2 insertions(+), 1 deletion(-) delete mode 100644 docsource/.DS_Store delete mode 100755 docsource/figures/.DS_Store delete mode 100755 pisces/.DS_Store delete mode 100755 pisces/R/.DS_Store delete mode 100755 pisces/scripts/.DS_Store diff --git a/.gitignore b/.gitignore index 7e99e36..0205d62 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*.pyc \ No newline at end of file +*.pyc +.DS_Store diff --git a/docsource/.DS_Store b/docsource/.DS_Store deleted file mode 100644 index 4a8d67e8fe3fd6e3cafd8bd7a299769dbf39f185..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10244 zcmeHMYit}>6+Xvy?3vz9JZ_r!5j$DejpHO6vVO&Ok~(X9)5J;IJnSUS13SArQ)k3@ z$KBbr>$oW{v_Go0DN-Ty34I`;Dycw-6x4_Eqk^U_5<*xaEh;|%!4G~E1b+}haPGY` zEAMV>1pG)8%}6t6?m74D+{DXmk!1=XA<7~u;bXd_jux{WloRjcMdqIOx#MD zY2j=cCRI6g%t{%~bib2L8Ls%9A!$%mRZ++En%UX*NJlu-*4c3`9GdOwiiATQ?OV^C zQbooL_d-LovER8%63xYq~~o{APy*J^clVybFR{TO5>Ts3#_eR5Dw%z@J!G9V-D63|&Q7G2eUd4@@RCV}E zlt{X(O4ltBl^wC;3DedMRl0R_s_fX5?32FFQ}%3A*$Ab<1U1>M!my|A-O1P}|MLF0 zn>jFPrl?g{>GQnrtby%Pl;hmmXBei@6m8+41`r)>ibiCKNLYbd2tzL%h7*t^R8PZG z@HCuEk#%uh5vB6qB~bLFbFcW>C-*0rNAH<|_G zmj|(hs1i_;SP>ZVF(PqWT~k}f>eV%iq5TM<&U{GW(9tXAMeRn`q!3zkMHJO$)}j)m zb(Gd-7FHC!MB28nR+Uh!mr7fQx>X^(>dU0HN8PRxqV+Oq?Ny@+0aq`dudAu8?jKN9 z9e9GNiy&|b{JjPj;cdd+FX0*??sxDf!rb2pZxvW2@U|N3aSg5~%x%O?xEZ%%7xobD zIO^`j-8hJcaR`s#QNmuFuxFr&<2Z>AqKg^A;Foa@zk*-I$MHOVjga_t!s55^^#b$_ z`nZ!4FI^9P1^iF}4-hMi6IG|d%wvbEJ~nR@Raei>cDA;MLxLN6+Iom5B5hqo`yby|Hsg^1r;t2Oc*%gb@`6>Y7y zPEi)7yZm^Iwvi{E$bG5V%(n1k6H|b6ZDSog8|Bm>Rb8xyr=UMYNaYUJEAl@#^GJ1{ zHo&KWf@w(F)@r-+C`3X6AeYJcG|~VO=!c^)3X`-fdJ-1_;cCb99H-F?Uxd-ku1P!0qx{F1f^Rg0;RGeA%6b9Yw`d8Z5Z-%^JhS?|2K`Lmt(>H|JL3A{~H>MYmfi{ diff --git a/docsource/figures/.DS_Store b/docsource/figures/.DS_Store deleted file mode 100755 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 zt3?x{M2U(I3K|6g4UqUEg1mTv2n1i$CWsMV4C0f%XhdK1J7;Fw?UW{HY68hjGV`6= zcjlbm{%5{9X9*z?=~c=J2@*m$JgJogyxifS_4FKdWVt0GT0nmi>$7@dx*5a#AER3P z1wH|vfKR|D;1lo(+zSLSX48WUIM%=V1bhNMfgu9){Sd^H+OTT}9LruEcwtKbwAon9 z4opilfi~#cuxke#i#2$FXN$Nqz? zIf1Za{i{#FCoq}-UAt!!g~-Gva_0Iy)}w2d9SU8CtO?l@C*^SbWIS>wOZmCmn!C)t zW?Qy+(I-SPx-7t=T zAm%o#Tvf1Ih)PrF@HY0?nrT?AYQjPrmGUg)v?gvE?(m~heq7e$y}E3xwYuJ{?NP0^ zxT*Jc8y0OHPnf#iXj&TWrlZ|fciFVfsH*nOmYJ~Ym}9$!{X4CsUQ2CKv2!h{*5fTK ziSXRXyqU!hJhUKGz2vb$DSyh;f@y-V8Q~hX_JpP;HuX`$9kLZy4Mj6{v?VECU2QSs zZZ#juc%FF`&Ju;K3ii3dRH&aKo#C9CIkQA@3$3Y3sCZR4jycRN5rq~D2QSHP!+E8G z(DEey#**cBTDDLSH|cUr)g^^@+{!Biu`TKFlh(O~i>3);d)(~q#z9+d+0tdAxSig1 zy=>VVdsG95beZ0JMYuMCLmF#d9n_~*QRqNR(!YI4xYS9Ogx%;-5)L^fJcgxWQbFoT zJLx0|k|HmW*U1s`A^C`$CFjTm@+G-OZjf8#SMnQfdfg^}LKfsfK1_kBPz0qgA0CEE zSPV;H87zkeSPSc613V6GunnTn2{P=21lZ6ENq7dHhZkT!9E78A435JI_yEqpCvXYA zg0JBl_#SS;FW_#iWu-N|Ha!o=!fmuek2Gr7-IB|CZ43>sT(x@ontNHp*Yh0ZPMq`7K(y2foY%CQdvtAnaXK)QH>x-+3Y!>*=5VIb`q02%?9QO0zM%$ zSyzhP0LPDqQm>Lh@*`8}U*vDd0RakOF6#3U)MgkWsLYkH3L2paT3{2jqBdoSg90iT z(1QxKP^0~*(7o_1JO?kr0eBf+fmh)$yajK=J8%R}!YMe7dOZv0;A8k4uEHSPa5Ooj z)pZ`N{*bBFWOKsMO=Aa^QEdZh$_Plmr`7xI&>NT!@FJhb-ys~iAt)*pWd@QK)o9Sw zq-tzTtjE`Y6S@9lgvyM1JYN#-`V>b@@P&&3X=W<5q#VOg`QoaPA&KV-=T$as-?{g| zp~G*|c;pR7`K*)S3k^qz!2@OzzKB$e*&4Bkoq^~|8qu{hB904iA~!Z5Hj8l~ke+yD zWldSIvNGhI_>2e3A6?Vgy?amUss5+maAr6Zh4Y4I%u4$` z+Sb_aeFsmSy>Ri1OP8u)8?g@E?s>*^D z)gB%^_pjx3hpBpCOdN*bsqt|2vpxZzfKTAQBQU`kaggr+H;n!N|9v+vf51KgpTOM{ zfb9C_`Utj9?0Wyf&rUq)LE8;Dmbu_%#2*2eq1*9nR|V_by%gpb#&YUE;8^C4 jdFGBbn--&`?$kQ_G2{McK)33z|NZs<_^p2L!}b3^2i(r& diff --git a/pisces/R/.DS_Store b/pisces/R/.DS_Store deleted file mode 100755 index b89866d200cf60844e8d2b82f5c72bcb9520f3ce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJB|V|47C9XB$_F=%oVsngxC{sfnAtxkwARR?m|262~u5 zUNIhvh%PVfLZlIq8Qf5=HuTN*&3iV=hyvj_<0#*9Zcp3YX5Y(x9x(1y4&ZI&=bpdc z*%ploPys4H1*iZO7+8TUu(SEV=khozKn31k0sB4_xM5A~0{zp0!CL^}1YtMKy_WzM z3jk|k7l;T^#6|}uBZSN_*V+(WVK!`@uaM+y~kOtE$}b6<=o(Am^%f7mt&xpV=Sy3e|S>l a6`NzfCU${NN8IT^{tTEdG%E0F1%3g&>lKUu diff --git a/pisces/scripts/.DS_Store b/pisces/scripts/.DS_Store deleted file mode 100755 index 9533b8ae2b6f4247cc7b31aa98079224189c678c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJx{|h5PgOYl~^h!#{2=OU|^1_upt)245d_1sg#fpB)0r?-t!qmM21dPh0fCX zo$q|-^Q&sd0K%+&ItL~I#uQI0HLjfOodq_{h+EXTTY7 z27VaO`H)eBdBV;x9v$rB6M#6PIX3EgODHBwm?!KE>0t{NO0>|Bo*2QxY0uKUJYi>O z;RtbM=1D7mKVCvyo%SqFN5~DmcLtn+J_Cm~oap|);$LR)k>3yTku%^7{4)l^sJJa= z+*G<-|7=fpZAN*ZsA*m&g^haeBY*>)BPY>m{!BXO Date: Thu, 4 Mar 2021 14:13:45 -0500 Subject: [PATCH 16/52] Separate package installation from R/binary dependency management --- README.md | 2 +- pisces/__init__.py | 37 +++++++++++++++++++++++++++++++++++- pisces/cli.py | 17 ----------------- setup.py | 47 +--------------------------------------------- 4 files changed, 38 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 5e2b8f9..6f64e15 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Matthew D Shirley, Viveksagar K Radhakrishna, Javad Golji, Joshua M Korn. (Prepr Installation: ``` $ pip install --user novartis-pisces -$ pisces dependencies +$ pisces_dependencies ``` Submitting jobs to an HPC cluster: diff --git a/pisces/__init__.py b/pisces/__init__.py index 6d7c300..8d62550 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -22,11 +22,46 @@ unique_id = ''.join(random.choice(string.digits) for _ in range(10)) - def find_data_directory(): """ Returns the path to the module directory """ return os.path.dirname(os.path.abspath(__file__)) +def install_salmon(): + """Install a version of salmon from bioconda""" + import glob + import requests + from io import BytesIO + from urllib.request import urlopen + from shutil import rmtree + from subprocess import call + + redist = os.path.join(find_data_directory(), 'redist') + rmtree(os.path.join(redist, "salmon"), ignore_errors=True) + + if platform.system() == "Linux": + salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2" + elif platform.system() == "Darwin": + salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/osx-64/salmon-1.3.0-hb70dc8d_0.tar.bz2" + + print("Installing salmon") + with requests.get(salmon_url, allow_redirects=True) as response: + tar_file = BytesIO(response.content) + tar_file.seek(0) + with tarfile.open(fileobj=tar_file) as tar: + tar.extractall(path=os.path.join(redist, "salmon")) + +def install_r_dependencies(): + """Install R dependencies""" + cmd = [ + 'Rscript', + os.path.join(find_data_directory(), 'R/set_up_dependencies.R') + ] + call(cmd) + +def install_dependencies(): + install_salmon() + install_r_dependencies() + def sra_valid_accession(accession): """ Test whether a string is an SRA accession """ if accession.startswith('SRR') and len(accession) == 10: diff --git a/pisces/cli.py b/pisces/cli.py index 3dabd81..518bdb0 100755 --- a/pisces/cli.py +++ b/pisces/cli.py @@ -59,16 +59,6 @@ def call_Rscript(args, unknown_args): ] cmd.extend(unknown_args) call(cmd) - -def call_Rscript_dependencies(args, unknown_args): - """ Call an R script, passing through arguments from argparse. """ - cmd = [ - 'Rscript', - os.path.join(data_dir, 'R/set_up_dependencies.R') - ] - cmd.extend(unknown_args) - call(cmd) - def default_species_index(conf): """ Return a list of the default index builds for each species defined in args.conf """ @@ -249,13 +239,6 @@ def create_parser(args=None): "Compile an expression matrix from multiple individual PISCES runs", add_help=False) matrix.set_defaults(func=call_Rscript) - - dependencies = subparsers.add_parser( - 'dependencies', - help= - "Install R dependencies for PISCES", - add_help=False) - dependencies.set_defaults(func=call_Rscript_dependencies) qc = subparsers.add_parser( 'summarize-qc', diff --git a/setup.py b/setup.py index 79a07dc..b5d3bdd 100644 --- a/setup.py +++ b/setup.py @@ -6,58 +6,13 @@ import logging import platform from setuptools import setup -from setuptools.command.develop import develop -from setuptools.command.install import install install_requires = [ 'six', 'setuptools >= 0.7', 'simplesam >= 0.0.3', 'tqdm', 'fastqp >= 0.2', 'pandas', 'strandex', 'gffutils', 'pyfaidx', 'drmaa', 'twobitreader', 'requests' ] - -class PostDevelopCommand(develop): - def run(self): - develop.run(self) - install_binary_dependencies() - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_binary_dependencies() - - -def install_binary_dependencies(): - sys.path.pop(0) # ignore local search path - import pisces - import glob - import requests - from io import BytesIO - from urllib.request import urlopen - from shutil import rmtree - from subprocess import call - - redist = os.path.join(os.path.dirname(pisces.__file__), 'redist') - rmtree(os.path.join(redist, "salmon"), ignore_errors=True) - local_redist = os.path.join(os.path.dirname(__file__), 'pisces/redist') - - if platform.system() == "Linux": - salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2" - elif platform.system() == "Darwin": - salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/osx-64/salmon-1.3.0-hb70dc8d_0.tar.bz2" - - print("Installing salmon") - with requests.get(salmon_url, allow_redirects=True) as response: - tar_file = BytesIO(response.content) - tar_file.seek(0) - with tarfile.open(fileobj=tar_file) as tar: - tar.extractall(path=os.path.join(redist, "salmon")) - setup( - cmdclass={ - 'develop': PostDevelopCommand, - 'install': PostInstallCommand, - }, name='novartis-pisces', author='Matthew Shirley', author_email='matt_d.shirley@novartis.com', @@ -70,7 +25,7 @@ def install_binary_dependencies(): install_requires=install_requires, use_scm_version={"local_scheme": "no-local-version"}, setup_requires=['setuptools_scm'], - entry_points={'console_scripts': ['pisces = pisces.cli:main']}, + entry_points={'console_scripts': ['pisces = pisces.cli:main', 'pisces_dependencies = pisces:install_dependencies']}, classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", From e0177a7571ad99e84470d807d4dc6e8b02e102d3 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 14:35:14 -0500 Subject: [PATCH 17/52] missing import --- pisces/__init__.py | 1 + setup.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 8d62550..f0b38e3 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -11,6 +11,7 @@ import string import random import shutil +import platform from pprint import pformat from subprocess import Popen, PIPE from multiprocessing import Process diff --git a/setup.py b/setup.py index b5d3bdd..53238d5 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,6 @@ import zipfile import subprocess import logging -import platform from setuptools import setup install_requires = [ From dea7a0b9419e14a09efe3634bb2650146ef6a6ba Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 19:35:56 -0500 Subject: [PATCH 18/52] Move imports --- pisces/__init__.py | 1 + setup.py | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index f0b38e3..6ce91e4 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -12,6 +12,7 @@ import random import shutil import platform +import tarfile from pprint import pformat from subprocess import Popen, PIPE from multiprocessing import Process diff --git a/setup.py b/setup.py index 53238d5..32e0329 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,3 @@ -import os -import sys -import tarfile -import zipfile -import subprocess -import logging from setuptools import setup install_requires = [ From 47335dd8a43d876c287a0a07780386f0c68cf95d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 19:51:37 -0500 Subject: [PATCH 19/52] import call --- pisces/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 6ce91e4..2858ec3 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -14,7 +14,7 @@ import platform import tarfile from pprint import pformat -from subprocess import Popen, PIPE +from subprocess import Popen, PIPE, call from multiprocessing import Process from tempfile import NamedTemporaryFile, mkdtemp from functools import partial From 0e44850d0931a66c9f9f3cc0191723b43d80bd8d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 10:49:22 -0400 Subject: [PATCH 20/52] Negate filtering --- pisces/R/make_expression_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index 369b302..c071679 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -389,7 +389,7 @@ deseq_analysis <- function(contrasts, txi.gene, contrast.metadata, formula, fail } else { deseq.dataset <- DESeqDataSetFromTximport(txi.gene, contrast.metadata, as.formula(paste0("~", contrasts$Factor[1])))} message(paste("Filtering", length(failing.quartile.filter), "genes failing --quartile-expression cutoff from DESeq2 dataset.")) - deseq.dataset <- deseq.dataset[failing.quartile.filter,] + deseq.dataset <- deseq.dataset[-failing.quartile.filter,] message("Running DESeq2...") deseq.dataset <- DESeq(deseq.dataset) From f321fc1456cca931ff1f01c9bd1a8a92d16b9e14 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 10:54:48 -0400 Subject: [PATCH 21/52] Better document first_quartile --- pisces/R/make_expression_matrix.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index c071679..e4155fb 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -269,7 +269,14 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { "genes.")) ribo.genes <- which(grepl("^RP[SL]", scaled_df[, "symbol"], ignore.case = T)) - first_quartile <- function(x) { y <- quantile(x, c(0.25, 0.5, 0.75), type=1) + + + first_quartile <- function(x) { + # > seq(10) + # [1] 1 2 3 4 5 6 7 8 9 10 + # > first_quartile(seq(10)) + #[1] 3 + y <- quantile(x, c(0.25, 0.5, 0.75), type=1) return(y[[1]]) } From bfc6805d9e66a03d9d0ea82ab15695b43756c342 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 11:01:40 -0400 Subject: [PATCH 22/52] Take the top quantile instead of bottom --- pisces/R/make_expression_matrix.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index e4155fb..b7dda00 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -275,9 +275,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { # > seq(10) # [1] 1 2 3 4 5 6 7 8 9 10 # > first_quartile(seq(10)) - #[1] 3 + #[1] 8 y <- quantile(x, c(0.25, 0.5, 0.75), type=1) - return(y[[1]]) + return(y[[3]]) } message(paste("Excluding", length(ribo.genes), "ribosomal genes from TMM scaling factor calulation.")) From ab6760a1e72604a1dca873375bac81646c772a08 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 11:09:29 -0400 Subject: [PATCH 23/52] function doc --- pisces/R/make_expression_matrix.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index b7dda00..4f74ca4 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -272,10 +272,8 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { first_quartile <- function(x) { - # > seq(10) - # [1] 1 2 3 4 5 6 7 8 9 10 - # > first_quartile(seq(10)) - #[1] 8 + # > first_quartile(c(0,0,0,0,0,0,0,1,10,10)) + # [1] 1 y <- quantile(x, c(0.25, 0.5, 0.75), type=1) return(y[[3]]) } From 89aadee0b196c569729358fc7b8300efbab6d4c5 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 15 Apr 2021 10:50:55 -0400 Subject: [PATCH 24/52] Fix exon ordering for minus strand transcripts in #15 --- pisces/index.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pisces/index.py b/pisces/index.py index 62184b6..5d2dc0f 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -266,10 +266,16 @@ def features_to_string(features, fasta_in, masked=True, strand=True): """ """ sequences = [] + feature_strand = "." for feature in features: + feature_strand = feature.strand sequences.append( feature.sequence( fasta_in, use_strand=strand)) + # if the transcript is on the reverse strand, reverse order of exons + # before concatenating + if feature_strand == "-": + sequences = sequences[::-1] seq = ''.join(sequences) mask_count = sum(seq.count(a) for a in soft_chars) if masked: From f0a0f0f3a7c07e767cd3b1006ae2fa5c7116c17d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 9 Jul 2021 14:44:43 -0400 Subject: [PATCH 25/52] Enable selective alignment and gcBias flags --- pisces/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 2858ec3..b41e8d2 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -271,7 +271,7 @@ def format_salmon_command(libtype, threads, index, output_dir, read1, read2, 'bin', 'salmon'), '--no-version-check', 'quant', '-q', '--index', index, '--libType', libtype, '--mates1', ' '.join(read1), '--mates2', ' '.join(read2), '--output', output_dir, '--threads', - str(threads), '--seqBias', '--useVBOpt' + str(threads), '--seqBias', '--gcBias', '--validateMappings', '--dumpEq' ] elif not read2: cmd = [ @@ -279,7 +279,7 @@ def format_salmon_command(libtype, threads, index, output_dir, read1, read2, 'bin', 'salmon'), '--no-version-check', 'quant', '-q', '--index', index, '--libType', libtype, '-r', ' '.join(read1), '--output', output_dir, '--threads', - str(threads), '--seqBias', '--useVBOpt' + str(threads), '--seqBias', '--gcBias', '--validateMappings', '--dumpEq' ] if make_bam: import shlex From 4b62d5f74e7d1c21d7bf6ef51df34ae446b61c5f Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 1 Sep 2021 21:32:15 -0400 Subject: [PATCH 26/52] Add gene-level effective lengths (#6) --- pisces/R/make_expression_matrix.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index 4f74ca4..6067823 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -239,11 +239,17 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { message("Writing annotation data matrix...") write.table(as.data.frame(annotation), paste0(args[["--name"]], ".", species, ".annotation.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + + message("Making gene-level length matrix...") + as.data.frame(txi.gene$abundance) %>% tibble::rownames_to_column("gene_id") %>% + left_join(annotation) %>% as.data.frame %>% + write.table(format(., nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], + ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, + col.names = T) - message("Making raw TPM matrix...") + message("Making gene-level TPM matrix...") raw_df <- as.data.frame(txi.gene$abundance) %>% tibble::rownames_to_column("gene_id") %>% left_join(annotation) %>% as.data.frame - if (!args[["--no-rescale"]]) { message(paste("Scaling TPM of", args[["--scale-tpm"]], "to 1e6...")) @@ -263,6 +269,7 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { write.table(format(normed_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], ".", species, ".log2fc.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) + message(paste("Calculating TMM scaling factors using", args[["--scale-tpm"]], From ea86b2c1993cb02b1647a98ee9745483614579e9 Mon Sep 17 00:00:00 2001 From: Peter Skewes-Cox Date: Thu, 2 Sep 2021 18:16:10 -0700 Subject: [PATCH 27/52] Fixes auto sample naming Proposed fix for #19 - could use some testing / review. --- pisces/run.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pisces/run.py b/pisces/run.py index 2adc886..2ba3d2f 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -62,12 +62,15 @@ def run(args, unknown_args): # set up the default sample name if not args.name: if args.fq2: - sample_name = long_substr(tuple(chain(args.fq1, args.fq2))) + sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq2.split()))) else: sample_name = long_substr(tuple(chain(args.fq1, args.fq1))) sample_name = sample_name.split('/')[-1].split('_')[0] logging.info("Sample name: %s", sample_name) - assert len(sample_name) > 0 + try: + assert len(sample_name) > 0 + except AssertionError: + raise RuntimeError("Output file naming failed; provide an output file basename with `--name`.") else: sample_name = args.name From 63ac08d7764a68f70abe63ce62ab01fb7b13b027 Mon Sep 17 00:00:00 2001 From: Peter Skewes-Cox Date: Tue, 7 Sep 2021 13:51:59 -0700 Subject: [PATCH 28/52] Added split for single read as well --- pisces/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/run.py b/pisces/run.py index 2ba3d2f..3736a2a 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -64,7 +64,7 @@ def run(args, unknown_args): if args.fq2: sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq2.split()))) else: - sample_name = long_substr(tuple(chain(args.fq1, args.fq1))) + sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq1.split()))) sample_name = sample_name.split('/')[-1].split('_')[0] logging.info("Sample name: %s", sample_name) try: From f0c4508d4fba3c93d202f4c49ead7c49c1843945 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 8 Sep 2021 11:36:07 -0400 Subject: [PATCH 29/52] Fix gene-level lengths output --- pisces/R/make_expression_matrix.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index 6067823..27b5ba6 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -241,10 +241,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { ".annotation.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) message("Making gene-level length matrix...") - as.data.frame(txi.gene$abundance) %>% tibble::rownames_to_column("gene_id") %>% - left_join(annotation) %>% as.data.frame %>% - write.table(format(., nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], - ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, + as.data.frame(txi.gene$length) %>% tibble::rownames_to_column("gene_id") %>% + left_join(annotation) %>% as.data.frame %>% format(nsmall = 3, scientific = F, trim = T) %>% + write.table(paste0(args[["--name"]], ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) message("Making gene-level TPM matrix...") From 9e646770d4ff670cda567b67ecd1efc596b653c2 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 2 Nov 2021 11:05:05 -0400 Subject: [PATCH 30/52] Don't split fq path lists args.fq1 and args.fq2 are lists and not strings --- pisces/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pisces/run.py b/pisces/run.py index 3736a2a..e3ce4f2 100644 --- a/pisces/run.py +++ b/pisces/run.py @@ -62,9 +62,9 @@ def run(args, unknown_args): # set up the default sample name if not args.name: if args.fq2: - sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq2.split()))) + sample_name = long_substr(tuple(chain(args.fq1, args.fq2))) else: - sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq1.split()))) + sample_name = long_substr(tuple(chain(args.fq1, args.fq1))) sample_name = sample_name.split('/')[-1].split('_')[0] logging.info("Sample name: %s", sample_name) try: From 8d00f98f69667ab452933bfc53e74ae7504ab69e Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 3 Nov 2021 14:31:02 -0400 Subject: [PATCH 31/52] Don't drop dimensions for matrices with only one sample (#24) --- pisces/R/make_expression_matrix.R | 44 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index 27b5ba6..abfb26f 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -126,11 +126,17 @@ NormalizeSamples <- function(abundances, metadata, args) { experimentals)] + 1) - log2(apply(abundances[, c(controls)] + 1, 1, median)) } + } else if (ncol(abundances) == 1) { + # only one sample + abundances <- log2(abundances + 1) - log2(abundances + 1) } else { # normalize to median of all samples message("Normalizing to median of all samples.") abundances <- log2(abundances + 1) - log2(apply(abundances + 1, 1, median)) } + } else if (ncol(abundances) == 1) { + # only one sample + abundances <- log2(abundances + 1) - log2(abundances + 1) } else { # no metadata file message("Normalizing to median of all samples.") @@ -139,10 +145,10 @@ NormalizeSamples <- function(abundances, metadata, args) { return(abundances) } -rescaleTPM <- function(df, columns = colnames(df), rows = 1:nrow(df), precision = 3) { - sums.before.norm <- apply(df[rows, columns], 2, sum) +rescaleTPM <- function(mat, columns = colnames(mat), rows = 1:nrow(mat), precision = 3) { + sums.before.norm <- apply(mat[rows, columns, drop=F], 2, sum) tpm.scale.factors <- 1e+06/sums.before.norm - scaled_df <- df + scaled_df <- mat scaled_df[rows, columns] <- data.frame(mapply("*", scaled_df[rows, columns], tpm.scale.factors)) scaled_df[, columns] <- round(scaled_df[, columns], precision) @@ -221,9 +227,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { write.table(paste0(args[["--name"]], ".", species, ".intergenes.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) message("Building transcript QC measure table...") - pct_intronic <- colSums(txi$counts[intron.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100 - pct_intergenic <- colSums(txi$counts[intergene.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100 - pct_genic <- colSums(txi$counts[tx.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100 + pct_intronic <- colSums(txi$counts[intron.idx,,drop=F]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),,drop=F]) * 100 + pct_intergenic <- colSums(txi$counts[intergene.idx,,drop=F]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),,drop=F]) * 100 + pct_genic <- colSums(txi$counts[tx.idx,,drop=F]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),,drop=F]) * 100 rbind(pct_genic, pct_intronic, pct_intergenic) -> qc_table rownames(qc_table) <- c("exonic", "intronic", "intergenic") @@ -261,9 +267,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { ".", species, ".TPM.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) normed_df <- scaled_df - normed_df[, abundance.cols] <- NormalizeSamples(normed_df[, abundance.cols], + normed_df[, abundance.cols] <- NormalizeSamples(normed_df[, abundance.cols, drop=F], metadata, args) - normed_df[, abundance.cols] <- round(normed_df[, abundance.cols], 3) + normed_df[, abundance.cols] <- round(normed_df[, abundance.cols, drop=F], 3) message("Making log2 fold change matrix...") write.table(format(normed_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], ".", species, ".log2fc.txt"), quote = FALSE, sep = "\t", row.names = F, @@ -285,7 +291,7 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { } message(paste("Excluding", length(ribo.genes), "ribosomal genes from TMM scaling factor calulation.")) - failing.quartile.filter <- which(apply(scaled_df[, abundance.cols], 1, first_quartile) < + failing.quartile.filter <- which(apply(scaled_df[, abundance.cols, drop=F], 1, first_quartile) < as.numeric(args[["quartile-expression"]])) message(paste(length(failing.quartile.filter), "genes failing --quartile-expression cutoff")) if (is.character(args[["--exclude-genes"]])) { @@ -309,26 +315,26 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { tmm.genes <- setdiff(tmm.genes, exclusion.list) - sums.before.norm <- apply(raw_df[tmm.genes, abundance.cols], 2, sum) + sums.before.norm <- apply(raw_df[tmm.genes, abundance.cols, drop=F], 2, sum) tpm.scale.factors <- 1e+06/sums.before.norm scaled_df <- raw_df if (!args[["--no-rescale"]]) { scaled_df[tmm.genes, abundance.cols] <- data.frame(mapply("*", scaled_df[tmm.genes, - abundance.cols], tpm.scale.factors)) + abundance.cols, drop=F], tpm.scale.factors)) } message(paste("Using", length(tmm.genes), "genes for TMM factor calculation")) - tmm.factors <- calcNormFactors(scaled_df[tmm.genes, abundance.cols]) + tmm.factors <- calcNormFactors(scaled_df[tmm.genes, abundance.cols, drop=F]) tmm_df <- scaled_df tmm_df[to.rescale, abundance.cols] <- data.frame(mapply("/", tmm_df[to.rescale, - abundance.cols], tmm.factors)) - tmm_df[, abundance.cols] <- round(tmm_df[, abundance.cols], 3) + abundance.cols, drop=F], tmm.factors)) + tmm_df[, abundance.cols] <- round(tmm_df[, abundance.cols, drop=F], 3) write.table(format(tmm_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], ".", species, ".TPM.TMM-scaled.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T) normed_tmm_df <- tmm_df - normed_tmm_df[, abundance.cols] <- NormalizeSamples(normed_tmm_df[, abundance.cols], + normed_tmm_df[, abundance.cols] <- NormalizeSamples(normed_tmm_df[, abundance.cols, drop=F], metadata, args) - normed_tmm_df[, abundance.cols] <- round(normed_tmm_df[, abundance.cols], + normed_tmm_df[, abundance.cols] <- round(normed_tmm_df[, abundance.cols, drop=F], 3) write.table(format(normed_tmm_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], ".", species, ".log2fc.TMM-scaled.txt"), quote = FALSE, @@ -531,9 +537,9 @@ min_length <- as.numeric(args[["--min-transcript-length"]]) min_idx <- which(apply(txi.salmon.isoform$length, 1, max) > min_length) message(paste("Removing", length(which(apply(txi.salmon.isoform$length, 1, max) <= min_length)), "transcripts less than", min_length, "bp.")) -txi.salmon.isoform$abundance <- txi.salmon.isoform$abundance[min_idx, ] -txi.salmon.isoform$counts <- txi.salmon.isoform$counts[min_idx, ] -txi.salmon.isoform$length <- txi.salmon.isoform$length[min_idx, ] +txi.salmon.isoform$abundance <- txi.salmon.isoform$abundance[min_idx,,drop=F] +txi.salmon.isoform$counts <- txi.salmon.isoform$counts[min_idx,,drop=F] +txi.salmon.isoform$length <- txi.salmon.isoform$length[min_idx,,drop=F] for (assembly in this.config[["fastas"]]) { message("Reading gene and transcript annotations.") From 051a474e9ad8845a3391c940b5aba9da7c4d7c40 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 17 Nov 2021 13:41:46 -0500 Subject: [PATCH 32/52] Allow user-defined GTF gene name and type tags --- pisces/config.json | 12 +++++++++--- pisces/index.py | 14 ++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/pisces/config.json b/pisces/config.json index d3f9873..2e49e3f 100755 --- a/pisces/config.json +++ b/pisces/config.json @@ -10,7 +10,9 @@ "extra_fastas": [], "options": { "-k": 31, - "infer_features": false + "infer_features": false, + "gtf_name_tag": "gene_name", + "gtf_type_tag": "gene_type" } } } @@ -26,7 +28,9 @@ "extra_fastas": [], "options": { "-k": 31, - "infer_features": false + "infer_features": false, + "gtf_name_tag": "gene_name", + "gtf_type_tag": "gene_type" } } } @@ -44,7 +48,9 @@ "extra_fastas": [], "options": { "-k": 31, - "infer_features": false + "infer_features": false, + "gtf_name_tag": "gene_name", + "gtf_type_tag": "gene_type" } } } diff --git a/pisces/index.py b/pisces/index.py index 5d2dc0f..ad7812b 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -320,12 +320,22 @@ def features_to_string(features, fasta_in, masked=True, strand=True): featuretype='transcript', order_by='start')) try: - gene_type = first_transcript['gene_type'][0] + if options["gene_type"] == True: + type_tag = "gene_type" + else: + type_tag = options["gene_type"] + gene_type = first_transcript[type_tag][0] except KeyError: + logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' try: - gene_name = first_transcript['gene_name'][0] + if options["gene_name"] == True: + name_tag = "gene_name" + else: + name_tag = options["gene_name"] + gene_name = first_transcript[name_tag][0] except KeyError: + logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' exons = db.children(gene, featuretype='exon', order_by='start') From 74da364d7cb17fc9471db15b4c1d35788c1451b8 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 17 Nov 2021 13:53:39 -0500 Subject: [PATCH 33/52] Invert default config value --- pisces/index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index ad7812b..f9bc9b9 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -43,7 +43,7 @@ def build_index(args, unknown_args): if args.indices and index_name not in args.indices: continue pprint(dataset, indent=4) - options = defaultdict(lambda: True) + options = defaultdict(lambda: False) options.update(dataset["options"]) index_dir_path = os.path.join(index_dir_base, species, index_name) if os.path.exists(index_dir_path): @@ -320,7 +320,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): featuretype='transcript', order_by='start')) try: - if options["gene_type"] == True: + if not options["gene_type"]: type_tag = "gene_type" else: type_tag = options["gene_type"] @@ -329,7 +329,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' try: - if options["gene_name"] == True: + if not options["gene_name"]: name_tag = "gene_name" else: name_tag = options["gene_name"] From 79a17578d201c665f1ea0c2b75c6c2850e9ba106 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 17 Nov 2021 14:28:39 -0500 Subject: [PATCH 34/52] Revert config truthiness --- pisces/index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index f9bc9b9..ad7812b 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -43,7 +43,7 @@ def build_index(args, unknown_args): if args.indices and index_name not in args.indices: continue pprint(dataset, indent=4) - options = defaultdict(lambda: False) + options = defaultdict(lambda: True) options.update(dataset["options"]) index_dir_path = os.path.join(index_dir_base, species, index_name) if os.path.exists(index_dir_path): @@ -320,7 +320,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): featuretype='transcript', order_by='start')) try: - if not options["gene_type"]: + if options["gene_type"] == True: type_tag = "gene_type" else: type_tag = options["gene_type"] @@ -329,7 +329,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' try: - if not options["gene_name"]: + if options["gene_name"] == True: name_tag = "gene_name" else: name_tag = options["gene_name"] From ce9e0c32c1e8dfb0f5cd9cca7886e2cbe11f5d73 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 17 Nov 2021 14:43:04 -0500 Subject: [PATCH 35/52] Write annotation before merging exons and calculating interfeatures --- pisces/index.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index ad7812b..a60baf6 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -297,23 +297,6 @@ def features_to_string(features, fasta_in, masked=True, strand=True): total=db.count_features_of_type('gene'), unit='gene') as pbar: for gene in db.features_of_type('gene'): - transcripts = db.children(gene, featuretype='transcript', order_by='start') - for transcript in transcripts: - fa_seq, frac_masked = features_to_string(db.children(transcript, - featuretype='exon', - order_by='start'), - reference, - masked=options["masked"]) - transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n') - transcripts_fasta.write(fa_seq + '\n') - if options["unprocessed_transcripts"]: - exons = db.children(gene, featuretype='exon', order_by='start') - merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive)) - introns = db.interfeatures(merged_exons, new_featuretype='intron') - transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n') - fa_seq, _ = features_to_string(introns, reference, masked=options["masked"]) - transcripts_fasta.write(fa_seq + '\n') - first_transcript = next( db.children( gene, @@ -338,8 +321,28 @@ def features_to_string(features, fasta_in, masked=True, strand=True): logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' + transcripts = db.children(gene, featuretype='transcript', order_by='start') + for transcript in transcripts: + # Write entry in the transcripts to genes table + gene2tx.write("{txp}\t{gene}\n".format( + gene=gene['gene_id'][0], + txp=transcript['transcript_id'][0])) + # Construct the transcript sequences and write them to the FASTA + fa_seq, frac_masked = features_to_string(db.children(transcript, + featuretype='exon', + order_by='start'), + reference, + masked=options["masked"]) + transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n') + transcripts_fasta.write(fa_seq + '\n') + exons = db.children(gene, featuretype='exon', order_by='start') merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive)) + if options["unprocessed_transcripts"]: + introns = db.interfeatures(merged_exons, new_featuretype='intron') + transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n') + fa_seq, _ = features_to_string(introns, reference, masked=options["masked"]) + transcripts_fasta.write(fa_seq + '\n') annotation.write( "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n". @@ -352,14 +355,13 @@ def features_to_string(features, fasta_in, masked=True, strand=True): chrom=gene.chrom, length=sum(len(exon) for exon in merged_exons), frac_masked=str(frac_masked))) + transcripts = db.children( gene, featuretype='transcript', order_by='start') for transcript in transcripts: - gene2tx.write("{txp}\t{gene}\n".format( - gene=gene['gene_id'][0], - txp=transcript['transcript_id'][0])) + pbar.update(1) if options["intergenes"]: From 0218188e0d6f27a71e63fbecf7fe8ace7806d39c Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 17 Nov 2021 14:45:09 -0500 Subject: [PATCH 36/52] typo --- pisces/index.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index a60baf6..f1b0d6a 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -360,8 +360,6 @@ def features_to_string(features, fasta_in, masked=True, strand=True): gene, featuretype='transcript', order_by='start') - for transcript in transcripts: - pbar.update(1) if options["intergenes"]: From 14d0c2245301680049058be667bede4fa8b380d5 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 17 Nov 2021 15:02:19 -0500 Subject: [PATCH 37/52] Grab GTF tags from exons --- pisces/index.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index f1b0d6a..bdcf299 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -297,17 +297,17 @@ def features_to_string(features, fasta_in, masked=True, strand=True): total=db.count_features_of_type('gene'), unit='gene') as pbar: for gene in db.features_of_type('gene'): - first_transcript = next( + first_exon = next( db.children( gene, - featuretype='transcript', + featuretype='exon', order_by='start')) try: if options["gene_type"] == True: type_tag = "gene_type" else: type_tag = options["gene_type"] - gene_type = first_transcript[type_tag][0] + gene_type = first_exon[type_tag][0] except KeyError: logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' @@ -316,7 +316,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): name_tag = "gene_name" else: name_tag = options["gene_name"] - gene_name = first_transcript[name_tag][0] + gene_name = first_exon[name_tag][0] except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' From fb1115a7d910e365570da81639e332d3baca7527 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 30 Mar 2022 09:58:30 -0400 Subject: [PATCH 38/52] Don't don't coerce data.frame to vector when length(columns) == 1 This is another fix for #24 --- pisces/R/make_expression_matrix.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index abfb26f..06d5c84 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -150,8 +150,8 @@ rescaleTPM <- function(mat, columns = colnames(mat), rows = 1:nrow(mat), precisi tpm.scale.factors <- 1e+06/sums.before.norm scaled_df <- mat scaled_df[rows, columns] <- data.frame(mapply("*", scaled_df[rows, - columns], tpm.scale.factors)) - scaled_df[, columns] <- round(scaled_df[, columns], precision) + columns, drop=F], tpm.scale.factors)) + scaled_df[, columns] <- round(scaled_df[, columns, drop=F], precision) return(scaled_df) } From fc09cc1de2a48147bd61e5b4e119982743373aba Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 26 May 2022 14:51:06 +0000 Subject: [PATCH 39/52] Use zgrep --- pisces/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index b41e8d2..7c4062c 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -22,7 +22,7 @@ __version__ = get_distribution("novartis_pisces").version -unique_id = ''.join(random.choice(string.digits) for _ in range(10)) +unique_id = ''.join(random.choice(string.digits) for _ in range(10)) def find_data_directory(): """ Returns the path to the module directory """ @@ -36,10 +36,10 @@ def install_salmon(): from urllib.request import urlopen from shutil import rmtree from subprocess import call - + redist = os.path.join(find_data_directory(), 'redist') rmtree(os.path.join(redist, "salmon"), ignore_errors=True) - + if platform.system() == "Linux": salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2" elif platform.system() == "Darwin": @@ -51,7 +51,7 @@ def install_salmon(): tar_file.seek(0) with tarfile.open(fileobj=tar_file) as tar: tar.extractall(path=os.path.join(redist, "salmon")) - + def install_r_dependencies(): """Install R dependencies""" cmd = [ @@ -59,7 +59,7 @@ def install_r_dependencies(): os.path.join(find_data_directory(), 'R/set_up_dependencies.R') ] call(cmd) - + def install_dependencies(): install_salmon() install_r_dependencies() @@ -69,7 +69,7 @@ def sra_valid_accession(accession): if accession.startswith('SRR') and len(accession) == 10: return True return False - + def long_substr(data): """ http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python """ substr = '' @@ -149,7 +149,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir, kmers2_outname, 'w') as kmers2_out: p1 = Popen( [ - 'fgrep', '-h', '-o', '-f', + 'zgrep', '-h', '-o', '-f', os.path.join(data_dir, 'data/fp_kmers_col1.txt'), ' '.join(fastq_1) ], @@ -157,7 +157,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir, stderr=PIPE) p2 = Popen( [ - 'fgrep', '-h', '-o', '-f', + 'zgrep', '-h', '-o', '-f', os.path.join(data_dir, 'data/fp_kmers_col1.txt'), ' '.join(fastq_2) ], @@ -169,7 +169,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir, with open(kmers1_outname, 'w') as kmers1_out: p1 = Popen( [ - 'fgrep', '-h', '-o', '-f', + 'zgrep', '-h', '-o', '-f', os.path.join(data_dir, 'data/fp_kmers_col1.txt'), ' '.join(fastq_1) ], From 6752dce522db8cfbdc416bb911ebaf6e05f74f42 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 26 May 2022 15:00:23 +0000 Subject: [PATCH 40/52] Update gitignore --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 0205d62..436ac46 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ *.pyc .DS_Store +.eggs +.tox +build +dist +docs +*.egg-info \ No newline at end of file From 979dd9e3fb75db9bf0b0400b239f3b3491554af7 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 26 May 2022 15:01:21 +0000 Subject: [PATCH 41/52] Add Dockerfile --- Dockerfile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100755 index 0000000..b73f15c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM ubuntu:18.04 + +COPY . /pisces +WORKDIR /pisces + +# Install R +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' +RUN apt update +RUN apt install r-base + +# Install PISCES +RUN python setup.py install +RUN pisces dependencies + +ENTRYPOINT ["/usr/local/bin/pisces"] \ No newline at end of file From 420f8f064cac85cf99203f8caed2ac86f85ba59a Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 26 May 2022 15:25:37 -0400 Subject: [PATCH 42/52] Use fixed grep patterns --- pisces/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 7c4062c..35e17cd 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -149,7 +149,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir, kmers2_outname, 'w') as kmers2_out: p1 = Popen( [ - 'zgrep', '-h', '-o', '-f', + 'zgrep', '-F', '-h', '-o', '-f', os.path.join(data_dir, 'data/fp_kmers_col1.txt'), ' '.join(fastq_1) ], @@ -157,7 +157,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir, stderr=PIPE) p2 = Popen( [ - 'zgrep', '-h', '-o', '-f', + 'zgrep', '-F', '-h', '-o', '-f', os.path.join(data_dir, 'data/fp_kmers_col1.txt'), ' '.join(fastq_2) ], @@ -169,7 +169,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir, with open(kmers1_outname, 'w') as kmers1_out: p1 = Popen( [ - 'zgrep', '-h', '-o', '-f', + 'zgrep', '-F', '-h', '-o', '-f', os.path.join(data_dir, 'data/fp_kmers_col1.txt'), ' '.join(fastq_1) ], From 01c5f5578248ba13a30b71d4c1866b4481787daa Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 5 Jan 2023 00:08:26 +0000 Subject: [PATCH 43/52] Use checklines=1 to avoid inconsistent dialect warnings --- pisces/index.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index bdcf299..de27b3a 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -61,7 +61,7 @@ def build_index(args, unknown_args): transcripts_fasta_file = os.path.join( index_dir_path, "transcripts", "transcripts.fa") - + with open(transcripts_fasta_file, 'w') as transcripts_fasta: ## all of this URI handling should probably use an existing library like ## https://github.com/intake/filesystem_spec @@ -104,12 +104,12 @@ def build_index(args, unknown_args): fasta.write(str(twobit[chrom]) + '\n') reference = Fasta( _fasta_local_path.replace("2bit", "fa")) - + with open(_fasta_local_path) as extra: logging.info("Adding entries from %s", fasta) for line in extra: transcripts_fasta.write(line) - + for gtf_loc, fasta_loc in zip(dataset["gtfs"], dataset["fastas"]): gtf = urlparse(gtf_loc) @@ -172,6 +172,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, database_filename, + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -188,6 +189,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, tmp_db, + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -220,6 +222,7 @@ def build_index(args, unknown_args): _gtf_local_path.replace(".gz", ""), _gtf_local_path.replace( ".gz", "") + '.db', + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -231,6 +234,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( _gtf_local_path, _gtf_local_path + '.db', + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -261,9 +265,9 @@ def build_index(args, unknown_args): gene_annotation = os.path.join( index_dir_path, assembly + "_gene_annotation.txt") - + def features_to_string(features, fasta_in, masked=True, strand=True): - """ + """ """ sequences = [] feature_strand = "." @@ -272,7 +276,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): sequences.append( feature.sequence( fasta_in, use_strand=strand)) - # if the transcript is on the reverse strand, reverse order of exons + # if the transcript is on the reverse strand, reverse order of exons # before concatenating if feature_strand == "-": sequences = sequences[::-1] @@ -320,7 +324,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' - + transcripts = db.children(gene, featuretype='transcript', order_by='start') for transcript in transcripts: # Write entry in the transcripts to genes table @@ -328,22 +332,22 @@ def features_to_string(features, fasta_in, masked=True, strand=True): gene=gene['gene_id'][0], txp=transcript['transcript_id'][0])) # Construct the transcript sequences and write them to the FASTA - fa_seq, frac_masked = features_to_string(db.children(transcript, - featuretype='exon', - order_by='start'), - reference, + fa_seq, frac_masked = features_to_string(db.children(transcript, + featuretype='exon', + order_by='start'), + reference, masked=options["masked"]) transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n') transcripts_fasta.write(fa_seq + '\n') - - exons = db.children(gene, featuretype='exon', order_by='start') + + exons = db.children(gene, featuretype='exon', order_by='start') merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive)) if options["unprocessed_transcripts"]: - introns = db.interfeatures(merged_exons, new_featuretype='intron') + introns = db.interfeatures(merged_exons, new_featuretype='intron') transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n') fa_seq, _ = features_to_string(introns, reference, masked=options["masked"]) transcripts_fasta.write(fa_seq + '\n') - + annotation.write( "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n". format( @@ -355,13 +359,13 @@ def features_to_string(features, fasta_in, masked=True, strand=True): chrom=gene.chrom, length=sum(len(exon) for exon in merged_exons), frac_masked=str(frac_masked))) - + transcripts = db.children( gene, featuretype='transcript', order_by='start') pbar.update(1) - + if options["intergenes"]: for seqid in reference.keys(): logging.info("Merging overlapping genes on %s", seqid) From 49b5fb59d5cd145155e83f9f6475aefd8ebcd9ce Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 5 Jan 2023 18:06:49 +0000 Subject: [PATCH 44/52] Allow a formal GTF dialect --- pisces/config.json | 3 +++ pisces/index.py | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pisces/config.json b/pisces/config.json index 2e49e3f..733c79a 100755 --- a/pisces/config.json +++ b/pisces/config.json @@ -11,6 +11,7 @@ "options": { "-k": 31, "infer_features": false, + "infer_dialect": true, "gtf_name_tag": "gene_name", "gtf_type_tag": "gene_type" } @@ -29,6 +30,7 @@ "options": { "-k": 31, "infer_features": false, + "infer_dialect": true, "gtf_name_tag": "gene_name", "gtf_type_tag": "gene_type" } @@ -49,6 +51,7 @@ "options": { "-k": 31, "infer_features": false, + "infer_dialect": true, "gtf_name_tag": "gene_name", "gtf_type_tag": "gene_type" } diff --git a/pisces/index.py b/pisces/index.py index de27b3a..a57edd0 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -12,6 +12,7 @@ def build_index(args, unknown_args): import gffutils.merge_criteria as mc import atexit import shutil + import copy from tqdm import tqdm from collections import defaultdict from pprint import pprint @@ -159,6 +160,19 @@ def build_index(args, unknown_args): else: reference = Fasta(_fasta_local_path) + if not options["infer_dialect"]: + ## set up a GTF dialect for gffutils + from gffutils.constants import dialect as gff_dialect + dialect = copy.copy(gff_dialect) + dialect['field separator'] = '; ' + dialect['keyval separator'] = ' ' + dialect['fmt'] = 'gtf' + dialect['quoted GFF2 values'] = True + dialect['repeated keys'] = True + dialect['trailing semicolon'] = True + else: + dialect = None + if gtf.scheme == '': database_filename = gtf.path + '.db' if os.path.exists(database_filename): @@ -172,7 +186,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, database_filename, - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -189,7 +203,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, tmp_db, - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -222,7 +236,7 @@ def build_index(args, unknown_args): _gtf_local_path.replace(".gz", ""), _gtf_local_path.replace( ".gz", "") + '.db', - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -234,7 +248,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( _gtf_local_path, _gtf_local_path + '.db', - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= From a13cbedcfb81306aaaf7aafd53773b009e8b8a40 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 6 Jan 2023 14:24:26 +0000 Subject: [PATCH 45/52] Pull gene tags from gene record --- pisces/index.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index a57edd0..22a63c8 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -162,6 +162,7 @@ def build_index(args, unknown_args): if not options["infer_dialect"]: ## set up a GTF dialect for gffutils + ## this dialect works for https://github.com/daler/gffutils/issues/198 from gffutils.constants import dialect as gff_dialect dialect = copy.copy(gff_dialect) dialect['field separator'] = '; ' @@ -315,17 +316,12 @@ def features_to_string(features, fasta_in, masked=True, strand=True): total=db.count_features_of_type('gene'), unit='gene') as pbar: for gene in db.features_of_type('gene'): - first_exon = next( - db.children( - gene, - featuretype='exon', - order_by='start')) try: if options["gene_type"] == True: type_tag = "gene_type" else: type_tag = options["gene_type"] - gene_type = first_exon[type_tag][0] + gene_type = gene[type_tag][0] except KeyError: logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' @@ -334,7 +330,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): name_tag = "gene_name" else: name_tag = options["gene_name"] - gene_name = first_exon[name_tag][0] + gene_name = gene[name_tag][0] except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' From 5cbf32e809a823b720af5dedf8f3ca3fc17c6e25 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 6 Jan 2023 14:54:03 +0000 Subject: [PATCH 46/52] Use correct GTF tags for gene names and biotypes --- pisces/index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index 22a63c8..de325d4 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -317,19 +317,19 @@ def features_to_string(features, fasta_in, masked=True, strand=True): unit='gene') as pbar: for gene in db.features_of_type('gene'): try: - if options["gene_type"] == True: - type_tag = "gene_type" + if options["gtf_type_tag"] == True: + type_tag = "gene_biotype" else: - type_tag = options["gene_type"] + type_tag = options["gtf_type_tag"] gene_type = gene[type_tag][0] except KeyError: logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' try: - if options["gene_name"] == True: - name_tag = "gene_name" + if options["gtf_name_tag"] == True: + name_tag = "gene_id" else: - name_tag = options["gene_name"] + name_tag = options["gtf_name_tag"] gene_name = gene[name_tag][0] except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) From a8fd0b04d39844b5ff3bf7770581750de603e0a4 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 1 Aug 2023 11:11:13 -0400 Subject: [PATCH 47/52] Update main.yml --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 22a44ce..29f3ec3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: python-version: ${{ matrix.python }} - name: Setup R - uses: r-lib/actions/setup-r@master + uses: r-lib/actions/setup-r@v2 with: r-version: '3.6.2' From 9936079ac75d4b75be95bad5bc962465e8c5f458 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 1 Aug 2023 14:47:09 -0400 Subject: [PATCH 48/52] Update publish-to-test-pypi.yml --- .github/workflows/publish-to-test-pypi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml index 20abfa5..2b4dc28 100644 --- a/.github/workflows/publish-to-test-pypi.yml +++ b/.github/workflows/publish-to-test-pypi.yml @@ -4,7 +4,7 @@ on: push jobs: build-n-publish: name: Build and publish packages to PyPI and TestPyPI - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest steps: - name: Checkout @@ -34,4 +34,4 @@ jobs: if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@master with: - password: ${{ secrets.pypi_password }} \ No newline at end of file + password: ${{ secrets.pypi_password }} From 99fd110c4b55fad3298d17f5ac5ecfc4f1b6f7e0 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 28 May 2024 09:00:49 -0400 Subject: [PATCH 49/52] Update build_docs.yml --- .github/workflows/build_docs.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index a9b0252..93f80c3 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -27,6 +27,9 @@ jobs: uses: r-lib/actions/setup-r@master with: r-version: '3.6.2' + + - name: Install renv + run: Rscript -e "install.packages(c('renv'), repos='https://cran.rstudio.com')" - name: Install python packages run: pip install tox From 8739201b818457ba7e2c9a08aa0f6f794b2778ee Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 28 May 2024 09:06:21 -0400 Subject: [PATCH 50/52] Update build_docs.yml --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 93f80c3..ee2a79e 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -24,7 +24,7 @@ jobs: python-version: 3.9 - name: Setup R - uses: r-lib/actions/setup-r@master + uses: r-lib/actions/setup-r@v2 with: r-version: '3.6.2' From d6909a155d57679e8d81c9084e5f904fbdcd3ddb Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 28 May 2024 09:29:37 -0400 Subject: [PATCH 51/52] Add section describing composite transcriptomes --- .gitignore | 3 ++- docsource/running.rst | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 436ac46..e1d07cb 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ build dist docs -*.egg-info \ No newline at end of file +*.egg-info +.vscode/settings.json diff --git a/docsource/running.rst b/docsource/running.rst index 59649fe..68fcc09 100644 --- a/docsource/running.rst +++ b/docsource/running.rst @@ -65,6 +65,16 @@ explicitly, or default to automatic values. .. command-output:: pisces run --help +The (``-t, --sample-type``) flag is used to specify the species name, +which will select the index files ``salmon quant`` uses to quantify transcript expression from the selected +set of fastq files. The species name is the top level key in the :doc:`config` file. If the ``--sample-type`` +defines a transcriptome composed of transcripts from multiple species/assemblies (GTF/FASTA pairs), the +transcript expression will be quantified using a single combined index with transcript abundance jointly +calculated on all transcripts from the composite transcriptome. The ``pisces summarize-expression`` command will separate transcript and gene +expression estimantes (counts and TPM) into separate output tables for each species/assembly and re-scale TPM values as if the +transcript and gene expression were estimated independently. This method ensures that sequence reads which map ambiguously +between similar transcripts will be assigned to the most likely transcript during quantification. + .. _submit_example: pisces submit From 36ca8f16e6b763abf341d78a15844d40696e3f37 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 28 May 2024 09:38:00 -0400 Subject: [PATCH 52/52] Update pisces_dependencies command --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index d56bf87..cc32cda 100644 --- a/tox.ini +++ b/tox.ini @@ -10,7 +10,7 @@ changedir = docsource deps = sphinx sphinxcontrib.bibtex sphinxcontrib.programoutput -commands = pisces dependencies +commands = pisces_dependencies make github [testenv:package]