From f66241476910512318fa35081885f059e1a10fe5 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 8 Dec 2020 15:07:52 -0500
Subject: [PATCH 01/52] Allow passing --libtype A for #3

---
 pisces/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pisces/cli.py b/pisces/cli.py
index 84e4cb7..3615730 100755
--- a/pisces/cli.py
+++ b/pisces/cli.py
@@ -209,7 +209,7 @@ def create_parser(args=None):
         '-l',
         '--libtype',
         type=str,
-        choices=('IU', 'ISF', 'ISR'),
+        choices=('IU', 'ISF', 'ISR', 'A'),
         help=
         "library geometry for Salmon (http://salmon.readthedocs.org/en/latest/salmon.html#what-s-this-libtype) default=auto"
     )

From 618305261b80ed82adc6b7be21662af7476ffd71 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 8 Dec 2020 15:11:15 -0500
Subject: [PATCH 02/52] Replace version number hardcoding

---
 pisces/__init__.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index aea1989..bca7d96 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -16,8 +16,13 @@
 from multiprocessing import Process
 from tempfile import NamedTemporaryFile, mkdtemp
 from functools import partial
+from pkg_resources import get_distribution, DistributionNotFound
 
-__version__ = "0.1.1"
+try:
+    __version__ = get_distribution(__name__).version
+except DistributionNotFound:
+     # package is not installed
+    pass
 
 unique_id = ''.join(random.choice(string.digits) for _ in range(10))    
 

From 51bc4dda70f3a5b813ef7efd7775348af96b5efe Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 8 Dec 2020 15:32:52 -0500
Subject: [PATCH 03/52] Don't expect version number to fail

---
 pisces/__init__.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index bca7d96..c00ed59 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -16,13 +16,9 @@
 from multiprocessing import Process
 from tempfile import NamedTemporaryFile, mkdtemp
 from functools import partial
-from pkg_resources import get_distribution, DistributionNotFound
+from pkg_resources import get_distribution
 
-try:
-    __version__ = get_distribution(__name__).version
-except DistributionNotFound:
-     # package is not installed
-    pass
+__version__ = get_distribution(__name__).version
 
 unique_id = ''.join(random.choice(string.digits) for _ in range(10))    
 

From 9a1c64873a964ea6ac5c4e1336c096c8d67d9d8d Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Mon, 14 Dec 2020 13:10:58 -0500
Subject: [PATCH 04/52] Automatic libtype setting, allow fastqp to detect file
 type.

---
 pisces/__init__.py | 1 -
 pisces/run.py      | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index c00ed59..00e2e8a 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -78,7 +78,6 @@ def read_level_qc(fastq, output_dir, sample_name):
         'nreads': 2000000,
         'base_probs': '0.25,0.25,0.25,0.25,0.1',
         'kmer': 5,
-        'type': 'fastq',
         'leftlimit': 1,
         'rightlimit': -1,
         'median-qual': 30,
diff --git a/pisces/run.py b/pisces/run.py
index 3cd9369..a187c11 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -112,6 +112,8 @@ def run(args, unknown_args):
     except RuntimeError:
         if not args.sample_type or not args.libtype:
             logging.error("Genotyping failed: please specify --sample-type and --libtype parameters manually.")
+            logging.error("Running salmon with --libtype A")
+            libtype = 'A'
 
     if args.libtype:
         libtype = args.libtype

From 4169ace918d4ccd7f7e337a3ce0f671e6c91a509 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Mon, 14 Dec 2020 13:16:37 -0500
Subject: [PATCH 05/52] Get version number from installed package

---
 pisces/cli.py    | 4 +++-
 pisces/index.py  | 4 +++-
 pisces/qc.py     | 4 +++-
 pisces/run.py    | 4 +++-
 pisces/submit.py | 3 ++-
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/pisces/cli.py b/pisces/cli.py
index 3615730..5db3e61 100755
--- a/pisces/cli.py
+++ b/pisces/cli.py
@@ -20,7 +20,9 @@
 from subprocess import Popen, PIPE, call
 from collections import OrderedDict, defaultdict
 from tqdm import tqdm
-from pisces import __version__, find_data_directory
+from pisces import find_data_directory
+
+__version__ = get_distribution("pisces").version
 
 data_dir = find_data_directory()
 
diff --git a/pisces/index.py b/pisces/index.py
index 4082586..317cb42 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -1,6 +1,8 @@
 import logging
 import os
-from pisces import __version__, find_data_directory
+from pisces import find_data_directory
+
+__version__ = get_distribution("pisces").version
 
 def build_index(args, unknown_args):
     from pyfaidx import Fasta
diff --git a/pisces/qc.py b/pisces/qc.py
index 9c6fceb..278d308 100644
--- a/pisces/qc.py
+++ b/pisces/qc.py
@@ -1,8 +1,10 @@
 from pisces import __version__
 from collections import OrderedDict, defaultdict
-from pisces import __version__, find_data_directory
+from pisces import find_data_directory
 from subprocess import Popen, PIPE, call
 
+__version__ = get_distribution("pisces").version
+
 data_dir = find_data_directory()
 
 class DefaultOrderedDict(OrderedDict):
diff --git a/pisces/run.py b/pisces/run.py
index a187c11..b7dc77c 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -4,9 +4,11 @@
 import time
 import socket
 from multiprocessing import Process
-from pisces import __version__, find_data_directory, long_substr
+from pisces import find_data_directory, long_substr
 from itertools import chain
 
+__version__ = get_distribution("pisces").version
+
 data_dir = find_data_directory()
 
 def run(args, unknown_args):
diff --git a/pisces/submit.py b/pisces/submit.py
index 078d01b..a2565b6 100644
--- a/pisces/submit.py
+++ b/pisces/submit.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-from pisces import __version__
 from signal import signal, SIGINT, SIGTERM
 from subprocess import Popen, PIPE, call
 from tqdm import tqdm
@@ -11,6 +10,8 @@
 import pandas as pd
 import time
 
+__version__ = get_distribution("pisces").version
+
 def _submit_drmaa(args, unknown_args):
     """ Submit multiple 'pisces run' jobs to the cluster using libdrmaa """
 

From 133a627015aeb00b798adafc0dfc9c740d75c541 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Mon, 14 Dec 2020 13:21:20 -0500
Subject: [PATCH 06/52] use proper package name

---
 .gitignore         | 1 +
 pisces/__init__.py | 2 +-
 pisces/cli.py      | 2 +-
 pisces/index.py    | 2 +-
 pisces/qc.py       | 2 +-
 pisces/run.py      | 2 +-
 pisces/submit.py   | 2 +-
 7 files changed, 7 insertions(+), 6 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7e99e36
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
\ No newline at end of file
diff --git a/pisces/__init__.py b/pisces/__init__.py
index 00e2e8a..680d54c 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -18,7 +18,7 @@
 from functools import partial
 from pkg_resources import get_distribution
 
-__version__ = get_distribution(__name__).version
+__version__ = get_distribution("novartis_pisces").version
 
 unique_id = ''.join(random.choice(string.digits) for _ in range(10))    
 
diff --git a/pisces/cli.py b/pisces/cli.py
index 5db3e61..79cb812 100755
--- a/pisces/cli.py
+++ b/pisces/cli.py
@@ -22,7 +22,7 @@
 from tqdm import tqdm
 from pisces import find_data_directory
 
-__version__ = get_distribution("pisces").version
+__version__ = get_distribution("novartis_pisces").version
 
 data_dir = find_data_directory()
 
diff --git a/pisces/index.py b/pisces/index.py
index 317cb42..ff878ad 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -2,7 +2,7 @@
 import os
 from pisces import find_data_directory
 
-__version__ = get_distribution("pisces").version
+__version__ = get_distribution("novartis_pisces").version
 
 def build_index(args, unknown_args):
     from pyfaidx import Fasta
diff --git a/pisces/qc.py b/pisces/qc.py
index 278d308..ea16a4d 100644
--- a/pisces/qc.py
+++ b/pisces/qc.py
@@ -3,7 +3,7 @@
 from pisces import find_data_directory
 from subprocess import Popen, PIPE, call
 
-__version__ = get_distribution("pisces").version
+__version__ = get_distribution("novartis_pisces").version
 
 data_dir = find_data_directory()
 
diff --git a/pisces/run.py b/pisces/run.py
index b7dc77c..8a3aff4 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -7,7 +7,7 @@
 from pisces import find_data_directory, long_substr
 from itertools import chain
 
-__version__ = get_distribution("pisces").version
+__version__ = get_distribution("novartis_pisces").version
 
 data_dir = find_data_directory()
 
diff --git a/pisces/submit.py b/pisces/submit.py
index a2565b6..90cf975 100644
--- a/pisces/submit.py
+++ b/pisces/submit.py
@@ -10,7 +10,7 @@
 import pandas as pd
 import time
 
-__version__ = get_distribution("pisces").version
+__version__ = get_distribution("novartis_pisces").version
 
 def _submit_drmaa(args, unknown_args):
     """ Submit multiple 'pisces run' jobs to the cluster using libdrmaa """

From 26d8e861d6bf23fdf0bc689bfb44f84c3d302c58 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Mon, 14 Dec 2020 13:26:10 -0500
Subject: [PATCH 07/52] import from pkg_resources

---
 pisces/cli.py    | 1 +
 pisces/fastq.py  | 1 +
 pisces/index.py  | 1 +
 pisces/qc.py     | 1 +
 pisces/run.py    | 1 +
 pisces/submit.py | 1 +
 6 files changed, 6 insertions(+)

diff --git a/pisces/cli.py b/pisces/cli.py
index 79cb812..3dabd81 100755
--- a/pisces/cli.py
+++ b/pisces/cli.py
@@ -21,6 +21,7 @@
 from collections import OrderedDict, defaultdict
 from tqdm import tqdm
 from pisces import find_data_directory
+from pkg_resources import get_distribution
 
 __version__ = get_distribution("novartis_pisces").version
 
diff --git a/pisces/fastq.py b/pisces/fastq.py
index 83ba6d3..09001eb 100755
--- a/pisces/fastq.py
+++ b/pisces/fastq.py
@@ -12,6 +12,7 @@
 from subprocess import Popen, PIPE
 from io import TextIOWrapper
 import os.path
+from pkg_resources import get_distribution
 
 if PY2:
     import string
diff --git a/pisces/index.py b/pisces/index.py
index ff878ad..f1f3378 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -1,6 +1,7 @@
 import logging
 import os
 from pisces import find_data_directory
+from pkg_resources import get_distribution
 
 __version__ = get_distribution("novartis_pisces").version
 
diff --git a/pisces/qc.py b/pisces/qc.py
index ea16a4d..8907ce4 100644
--- a/pisces/qc.py
+++ b/pisces/qc.py
@@ -2,6 +2,7 @@
 from collections import OrderedDict, defaultdict
 from pisces import find_data_directory
 from subprocess import Popen, PIPE, call
+from pkg_resources import get_distribution
 
 __version__ = get_distribution("novartis_pisces").version
 
diff --git a/pisces/run.py b/pisces/run.py
index 8a3aff4..2adc886 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -6,6 +6,7 @@
 from multiprocessing import Process
 from pisces import find_data_directory, long_substr
 from itertools import chain
+from pkg_resources import get_distribution
 
 __version__ = get_distribution("novartis_pisces").version
 
diff --git a/pisces/submit.py b/pisces/submit.py
index 90cf975..266a1aa 100644
--- a/pisces/submit.py
+++ b/pisces/submit.py
@@ -9,6 +9,7 @@
 import pickle
 import pandas as pd
 import time
+from pkg_resources import get_distribution
 
 __version__ = get_distribution("novartis_pisces").version
 

From e765a0c862209f549ad62c3936141d89322b2cad Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Mon, 14 Dec 2020 15:45:17 -0500
Subject: [PATCH 08/52] Pass type:None for fastqp

---
 pisces/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index 680d54c..6d7c300 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -78,6 +78,7 @@ def read_level_qc(fastq, output_dir, sample_name):
         'nreads': 2000000,
         'base_probs': '0.25,0.25,0.25,0.25,0.1',
         'kmer': 5,
+        'type': None,
         'leftlimit': 1,
         'rightlimit': -1,
         'median-qual': 30,

From 2aee0bdfd26067ae956a5b9fca2cd2cb8cb1999a Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Fri, 18 Dec 2020 10:36:15 -0500
Subject: [PATCH 09/52] adding renv.lock file

---
 pisces/R/renv.lock | 249 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 pisces/R/renv.lock

diff --git a/pisces/R/renv.lock b/pisces/R/renv.lock
new file mode 100644
index 0000000..d405715
--- /dev/null
+++ b/pisces/R/renv.lock
@@ -0,0 +1,249 @@
+{
+  "R": {
+    "Version": "4.0.3",
+    "Repositories": [
+      {
+        "Name": "BioCsoft",
+        "URL": "https://bioconductor.org/packages/3.12/bioc"
+      },
+      {
+        "Name": "BioCann",
+        "URL": "https://bioconductor.org/packages/3.12/data/annotation"
+      },
+      {
+        "Name": "BioCexp",
+        "URL": "https://bioconductor.org/packages/3.12/data/experiment"
+      },
+      {
+        "Name": "BioCworkflows",
+        "URL": "https://bioconductor.org/packages/3.12/workflows"
+      },
+      {
+        "Name": "CRAN",
+        "URL": "https://cloud.r-project.org"
+      }
+    ]
+  },
+  "Bioconductor": {
+    "Version": "3.12"
+  },
+  "Packages": {
+    "BH": {
+      "Package": "BH",
+      "Version": "1.72.0-3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "8f9ce74c6417d61f0782cbae5fd2b7b0"
+    },
+    "BiocManager": {
+      "Package": "BiocManager",
+      "Version": "1.30.10",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "db75371846625725e221470b310da1d5"
+    },
+    "R6": {
+      "Package": "R6",
+      "Version": "2.4.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "292b54f8f4b94669b08f94e5acce6be2"
+    },
+    "Rcpp": {
+      "Package": "Rcpp",
+      "Version": "1.0.5",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "125dc7a0ed375eb68c0ce533b48d291f"
+    },
+    "assertthat": {
+      "Package": "assertthat",
+      "Version": "0.2.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "50c838a310445e954bc13f26f26a6ecf"
+    },
+    "cli": {
+      "Package": "cli",
+      "Version": "2.0.2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "ff0becff7bfdfe3f75d29aff8f3172dd"
+    },
+    "clipr": {
+      "Package": "clipr",
+      "Version": "0.7.0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "08cf4045c149a0f0eaf405324c7495bd"
+    },
+    "crayon": {
+      "Package": "crayon",
+      "Version": "1.3.4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "0d57bc8e27b7ba9e45dba825ebc0de6b"
+    },
+    "digest": {
+      "Package": "digest",
+      "Version": "0.6.25",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "f697db7d92b7028c4b3436e9603fb636"
+    },
+    "dplyr": {
+      "Package": "dplyr",
+      "Version": "1.0.0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "4011f62581a34080e44105d4aa05a97f"
+    },
+    "ellipsis": {
+      "Package": "ellipsis",
+      "Version": "0.3.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "fd2844b3a43ae2d27e70ece2df1b4e2a"
+    },
+    "fansi": {
+      "Package": "fansi",
+      "Version": "0.4.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "7fce217eaaf8016e72065e85c73027b5"
+    },
+    "generics": {
+      "Package": "generics",
+      "Version": "0.0.2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "b8cff1d1391fd1ad8b65877f4c7f2e53"
+    },
+    "glue": {
+      "Package": "glue",
+      "Version": "1.4.2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "6efd734b14c6471cfe443345f3e35e29"
+    },
+    "hms": {
+      "Package": "hms",
+      "Version": "0.5.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "726671f634529d470545f9fd1a9d1869"
+    },
+    "jsonlite": {
+      "Package": "jsonlite",
+      "Version": "1.7.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "1ec84e070b88b37ed169f19def40d47c"
+    },
+    "lifecycle": {
+      "Package": "lifecycle",
+      "Version": "0.2.0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "361811f31f71f8a617a9a68bf63f1f42"
+    },
+    "magrittr": {
+      "Package": "magrittr",
+      "Version": "1.5",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "1bb58822a20301cee84a41678e25d9b7"
+    },
+    "pillar": {
+      "Package": "pillar",
+      "Version": "1.4.4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "b01d2494d0e2b6b02fae40e1543fbcb0"
+    },
+    "pkgconfig": {
+      "Package": "pkgconfig",
+      "Version": "2.0.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "01f28d4278f15c76cddbea05899c5d6f"
+    },
+    "purrr": {
+      "Package": "purrr",
+      "Version": "0.3.4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "97def703420c8ab10d8f0e6c72101e02"
+    },
+    "readr": {
+      "Package": "readr",
+      "Version": "1.3.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "af8ab99cd936773a148963905736907b"
+    },
+    "renv": {
+      "Package": "renv",
+      "Version": "0.12.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "b5510c50c7f31d453c385c7b460af2b9"
+    },
+    "rlang": {
+      "Package": "rlang",
+      "Version": "0.4.8",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "843a6af51414bce7f8a8e372f11d6cd0"
+    },
+    "stringi": {
+      "Package": "stringi",
+      "Version": "1.5.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "a063ebea753c92910a4cca7b18bc1f05"
+    },
+    "stringr": {
+      "Package": "stringr",
+      "Version": "1.4.0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "0759e6b6c0957edb1311028a49a35e76"
+    },
+    "tibble": {
+      "Package": "tibble",
+      "Version": "3.0.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "1c61e4cad000e03b1bd687db16a75926"
+    },
+    "tidyr": {
+      "Package": "tidyr",
+      "Version": "1.0.0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "8187ca2c27e80978f5b143bec554a437"
+    },
+    "tidyselect": {
+      "Package": "tidyselect",
+      "Version": "1.1.0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "6ea435c354e8448819627cf686f66e0a"
+    },
+    "utf8": {
+      "Package": "utf8",
+      "Version": "1.1.4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "4a5081acfb7b81a572e4384a7aaf2af1"
+    },
+    "vctrs": {
+      "Package": "vctrs",
+      "Version": "0.3.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "1739235995f08583db4095a28c357207"
+    }
+  }
+}

From 0f84c848790f6caa67c05183d9ec74db482f2863 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Fri, 18 Dec 2020 10:43:39 -0500
Subject: [PATCH 10/52] Use lockfile for dependency management.

---
 pisces/R/set_up_dependencies.R | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pisces/R/set_up_dependencies.R b/pisces/R/set_up_dependencies.R
index e5c899a..abacf64 100755
--- a/pisces/R/set_up_dependencies.R
+++ b/pisces/R/set_up_dependencies.R
@@ -12,7 +12,4 @@ script.dir <- getScriptPath()
 setwd(script.dir)
 renv::settings$use.cache(FALSE, persist = TRUE)
 renv::consent(provided = TRUE)
-renv::install("BiocManager")
-options(repos = BiocManager::repositories())
-renv::hydrate(packages=renv::dependencies()$Package)
-renv::snapshot()
\ No newline at end of file
+renv::restore()
\ No newline at end of file

From 9cc2974b1228d06f614da4284c6ed88f285b7a03 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Fri, 18 Dec 2020 13:31:23 -0500
Subject: [PATCH 11/52] Revert "Use lockfile for dependency management."

This reverts commit 0f84c848790f6caa67c05183d9ec74db482f2863.
---
 pisces/R/set_up_dependencies.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pisces/R/set_up_dependencies.R b/pisces/R/set_up_dependencies.R
index abacf64..e5c899a 100755
--- a/pisces/R/set_up_dependencies.R
+++ b/pisces/R/set_up_dependencies.R
@@ -12,4 +12,7 @@ script.dir <- getScriptPath()
 setwd(script.dir)
 renv::settings$use.cache(FALSE, persist = TRUE)
 renv::consent(provided = TRUE)
-renv::restore()
\ No newline at end of file
+renv::install("BiocManager")
+options(repos = BiocManager::repositories())
+renv::hydrate(packages=renv::dependencies()$Package)
+renv::snapshot()
\ No newline at end of file

From d62fdf6cee64aba7000e146d1d2e95d40eaac1e6 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Fri, 18 Dec 2020 13:31:40 -0500
Subject: [PATCH 12/52] Revert "adding renv.lock file"

This reverts commit 2aee0bdfd26067ae956a5b9fca2cd2cb8cb1999a.
---
 pisces/R/renv.lock | 249 ---------------------------------------------
 1 file changed, 249 deletions(-)
 delete mode 100644 pisces/R/renv.lock

diff --git a/pisces/R/renv.lock b/pisces/R/renv.lock
deleted file mode 100644
index d405715..0000000
--- a/pisces/R/renv.lock
+++ /dev/null
@@ -1,249 +0,0 @@
-{
-  "R": {
-    "Version": "4.0.3",
-    "Repositories": [
-      {
-        "Name": "BioCsoft",
-        "URL": "https://bioconductor.org/packages/3.12/bioc"
-      },
-      {
-        "Name": "BioCann",
-        "URL": "https://bioconductor.org/packages/3.12/data/annotation"
-      },
-      {
-        "Name": "BioCexp",
-        "URL": "https://bioconductor.org/packages/3.12/data/experiment"
-      },
-      {
-        "Name": "BioCworkflows",
-        "URL": "https://bioconductor.org/packages/3.12/workflows"
-      },
-      {
-        "Name": "CRAN",
-        "URL": "https://cloud.r-project.org"
-      }
-    ]
-  },
-  "Bioconductor": {
-    "Version": "3.12"
-  },
-  "Packages": {
-    "BH": {
-      "Package": "BH",
-      "Version": "1.72.0-3",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "8f9ce74c6417d61f0782cbae5fd2b7b0"
-    },
-    "BiocManager": {
-      "Package": "BiocManager",
-      "Version": "1.30.10",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "db75371846625725e221470b310da1d5"
-    },
-    "R6": {
-      "Package": "R6",
-      "Version": "2.4.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "292b54f8f4b94669b08f94e5acce6be2"
-    },
-    "Rcpp": {
-      "Package": "Rcpp",
-      "Version": "1.0.5",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "125dc7a0ed375eb68c0ce533b48d291f"
-    },
-    "assertthat": {
-      "Package": "assertthat",
-      "Version": "0.2.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "50c838a310445e954bc13f26f26a6ecf"
-    },
-    "cli": {
-      "Package": "cli",
-      "Version": "2.0.2",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "ff0becff7bfdfe3f75d29aff8f3172dd"
-    },
-    "clipr": {
-      "Package": "clipr",
-      "Version": "0.7.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "08cf4045c149a0f0eaf405324c7495bd"
-    },
-    "crayon": {
-      "Package": "crayon",
-      "Version": "1.3.4",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "0d57bc8e27b7ba9e45dba825ebc0de6b"
-    },
-    "digest": {
-      "Package": "digest",
-      "Version": "0.6.25",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "f697db7d92b7028c4b3436e9603fb636"
-    },
-    "dplyr": {
-      "Package": "dplyr",
-      "Version": "1.0.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "4011f62581a34080e44105d4aa05a97f"
-    },
-    "ellipsis": {
-      "Package": "ellipsis",
-      "Version": "0.3.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "fd2844b3a43ae2d27e70ece2df1b4e2a"
-    },
-    "fansi": {
-      "Package": "fansi",
-      "Version": "0.4.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "7fce217eaaf8016e72065e85c73027b5"
-    },
-    "generics": {
-      "Package": "generics",
-      "Version": "0.0.2",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "b8cff1d1391fd1ad8b65877f4c7f2e53"
-    },
-    "glue": {
-      "Package": "glue",
-      "Version": "1.4.2",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "6efd734b14c6471cfe443345f3e35e29"
-    },
-    "hms": {
-      "Package": "hms",
-      "Version": "0.5.3",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "726671f634529d470545f9fd1a9d1869"
-    },
-    "jsonlite": {
-      "Package": "jsonlite",
-      "Version": "1.7.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "1ec84e070b88b37ed169f19def40d47c"
-    },
-    "lifecycle": {
-      "Package": "lifecycle",
-      "Version": "0.2.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "361811f31f71f8a617a9a68bf63f1f42"
-    },
-    "magrittr": {
-      "Package": "magrittr",
-      "Version": "1.5",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "1bb58822a20301cee84a41678e25d9b7"
-    },
-    "pillar": {
-      "Package": "pillar",
-      "Version": "1.4.4",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "b01d2494d0e2b6b02fae40e1543fbcb0"
-    },
-    "pkgconfig": {
-      "Package": "pkgconfig",
-      "Version": "2.0.3",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "01f28d4278f15c76cddbea05899c5d6f"
-    },
-    "purrr": {
-      "Package": "purrr",
-      "Version": "0.3.4",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "97def703420c8ab10d8f0e6c72101e02"
-    },
-    "readr": {
-      "Package": "readr",
-      "Version": "1.3.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "af8ab99cd936773a148963905736907b"
-    },
-    "renv": {
-      "Package": "renv",
-      "Version": "0.12.3",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "b5510c50c7f31d453c385c7b460af2b9"
-    },
-    "rlang": {
-      "Package": "rlang",
-      "Version": "0.4.8",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "843a6af51414bce7f8a8e372f11d6cd0"
-    },
-    "stringi": {
-      "Package": "stringi",
-      "Version": "1.5.3",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "a063ebea753c92910a4cca7b18bc1f05"
-    },
-    "stringr": {
-      "Package": "stringr",
-      "Version": "1.4.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "0759e6b6c0957edb1311028a49a35e76"
-    },
-    "tibble": {
-      "Package": "tibble",
-      "Version": "3.0.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "1c61e4cad000e03b1bd687db16a75926"
-    },
-    "tidyr": {
-      "Package": "tidyr",
-      "Version": "1.0.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "8187ca2c27e80978f5b143bec554a437"
-    },
-    "tidyselect": {
-      "Package": "tidyselect",
-      "Version": "1.1.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "6ea435c354e8448819627cf686f66e0a"
-    },
-    "utf8": {
-      "Package": "utf8",
-      "Version": "1.1.4",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "4a5081acfb7b81a572e4384a7aaf2af1"
-    },
-    "vctrs": {
-      "Package": "vctrs",
-      "Version": "0.3.1",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "1739235995f08583db4095a28c357207"
-    }
-  }
-}

From 9f7ba0ee8d0a3b4faf41f0b3de0389d050e01a49 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 13 Jan 2021 14:30:40 -0500
Subject: [PATCH 13/52] Write out effective length matrices for transcripts and
 genes (#6)

---
 pisces/R/make_expression_matrix.R | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index 77c03b4..369b302 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -172,6 +172,13 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         as.data.frame() %>%
         format(nsmall = 3, scientific = F, trim = T) %>%
         write.table(paste0(args[["--name"]], ".", species, ".isoforms.counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
+        
+    as.data.frame(txi$length[tx.idx,]) %>% 
+        tibble::rownames_to_column("transcript_id") %>% 
+        left_join(tx2gene) %>%
+        as.data.frame() %>%
+        format(nsmall = 3, scientific = F, trim = T) %>%
+        write.table(paste0(args[["--name"]], ".", species, ".isoforms.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
     
     message("Writing intron level tables...")
     as.data.frame(txi$abundance[intron.idx,]) %>% 
@@ -187,6 +194,12 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         format(nsmall = 3, scientific = F, trim = T) %>%
         write.table(paste0(args[["--name"]], ".", species, ".introns.counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
         
+    as.data.frame(txi$length[intron.idx,]) %>% 
+        tibble::rownames_to_column("gene_id") %>% 
+        as.data.frame() %>%
+        format(nsmall = 3, scientific = F, trim = T) %>%
+        write.table(paste0(args[["--name"]], ".", species, ".introns.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
+        
     message("Writing intergene level tables...")
     as.data.frame(txi$abundance[intergene.idx,]) %>% 
         rescaleTPM() %>%
@@ -201,6 +214,12 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         format(nsmall = 3, scientific = F, trim = T) %>%
         write.table(paste0(args[["--name"]], ".", species, ".intergenes.counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
         
+    as.data.frame(txi$length[intergene.idx,]) %>% 
+        tibble::rownames_to_column("gene_id") %>% 
+        as.data.frame() %>%
+        format(nsmall = 3, scientific = F, trim = T) %>%
+        write.table(paste0(args[["--name"]], ".", species, ".intergenes.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
+        
     message("Building transcript QC measure table...")
     pct_intronic <- colSums(txi$counts[intron.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100
     pct_intergenic <- colSums(txi$counts[intergene.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100
@@ -309,6 +328,12 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         tibble::rownames_to_column("gene_id") %>% 
         inner_join(annotation) %>%
         write.table(paste0(args[["--name"]], ".", species, ".counts.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
+        
+    message("Writing effective length matrix...")
+    as.data.frame(txi.gene$length) %>% 
+        tibble::rownames_to_column("gene_id") %>% 
+        inner_join(annotation) %>%
+        write.table(paste0(args[["--name"]], ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
     
     if (is.character(args[["--deseq-formula"]])) {
       message(paste("DEseq formula:", args[["--deseq-formula"]]))

From 2ced2a2575039e16b1374d6abd4c06543a7ce516 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Mon, 8 Feb 2021 21:20:45 -0500
Subject: [PATCH 14/52] Allow extra_fastas remote URIs

---
 pisces/index.py | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index f1f3378..62184b6 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -61,12 +61,55 @@ def build_index(args, unknown_args):
 
             transcripts_fasta_file = os.path.join(
                 index_dir_path, "transcripts", "transcripts.fa")
+                
             with open(transcripts_fasta_file, 'w') as transcripts_fasta:
-                for fasta in dataset["extra_fastas"]:
-                    with open(fasta) as extra:
+                ## all of this URI handling should probably use an existing library like
+                ## https://github.com/intake/filesystem_spec
+                for fasta_loc in dataset["extra_fastas"]:
+                    fasta = urlparse(fasta_loc)
+                    if fasta.scheme == '':
+                        reference = Fasta(fasta.path)
+                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
+                        _fasta_local_path = os.path.join(
+                            download_dir, os.path.basename(fasta.path))
+                        logging.info("Downloading %s", fasta.geturl())
+                        if not os.path.exists(_fasta_local_path):
+                            with urlopen(fasta.geturl()) as _fasta:
+                                with open(_fasta_local_path,
+                                          'wb') as _fasta_local:
+                                    copyfileobj(_fasta, _fasta_local)
+                                    _fasta_local.flush()
+                                    if fasta.path.endswith('gz'):
+                                        logging.info("Decompressing %s",
+                                                     fasta.geturl())
+                                        call(
+                                            ' '.join([
+                                                'gzip -dc', _fasta_local_path,
+                                                '>',
+                                                _fasta_local_path.replace(
+                                                    ".gz", "")
+                                            ]),
+                                            shell=True)
+                        if _fasta_local_path.endswith("2bit"):
+                            logging.info("Converting %s to FASTA format",
+                                         fasta.geturl())
+                            twobit = TwoBitFile(_fasta_local_path)
+                            if not os.path.exists(
+                                    _fasta_local_path.replace("2bit", "fa")):
+                                with open(
+                                        _fasta_local_path.replace(
+                                            "2bit", "fa"), 'w') as fasta:
+                                    for chrom in twobit.keys():
+                                        fasta.write(">%s\n" % chrom)
+                                        fasta.write(str(twobit[chrom]) + '\n')
+                            reference = Fasta(
+                                _fasta_local_path.replace("2bit", "fa"))
+                    
+                    with open(_fasta_local_path) as extra:
                         logging.info("Adding entries from %s", fasta)
                         for line in extra:
                             transcripts_fasta.write(line)
+                            
                 for gtf_loc, fasta_loc in zip(dataset["gtfs"],
                                               dataset["fastas"]):
                     gtf = urlparse(gtf_loc)

From 4665240b09224274fe6ad78aaa019bd0bf2a3e6e Mon Sep 17 00:00:00 2001
From: David Barrat <david@barrat.io>
Date: Tue, 16 Feb 2021 23:30:30 +0100
Subject: [PATCH 15/52] remove .DS_Store files

---
 .gitignore                  |   3 ++-
 docsource/.DS_Store         | Bin 10244 -> 0 bytes
 docsource/figures/.DS_Store | Bin 6148 -> 0 bytes
 pisces/.DS_Store            | Bin 10244 -> 0 bytes
 pisces/R/.DS_Store          | Bin 6148 -> 0 bytes
 pisces/scripts/.DS_Store    | Bin 6148 -> 0 bytes
 6 files changed, 2 insertions(+), 1 deletion(-)
 delete mode 100644 docsource/.DS_Store
 delete mode 100755 docsource/figures/.DS_Store
 delete mode 100755 pisces/.DS_Store
 delete mode 100755 pisces/R/.DS_Store
 delete mode 100755 pisces/scripts/.DS_Store

diff --git a/.gitignore b/.gitignore
index 7e99e36..0205d62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-*.pyc
\ No newline at end of file
+*.pyc
+.DS_Store
diff --git a/docsource/.DS_Store b/docsource/.DS_Store
deleted file mode 100644
index 4a8d67e8fe3fd6e3cafd8bd7a299769dbf39f185..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10244
zcmeHMYit}>6+Xvy?3vz9JZ_r!5j$DejpHO6vVO&Ok~(X9)5J;IJnSUS13SArQ)k3@
z$KBbr>$oW{v_Go0DN-Ty34I`;Dycw-6x4_Eqk^U_5<*xaEh;|%!4G~E1b+}haPGY`
zEAMV>1pG)8%}6t6?m74D+<WeK?sxB90KopN(Fza(0EKQY_1o!jor~M~U38Urn&l*s
z`~hq*64UNv!gdlxwF5;Efgl1w1cC?z5eOo1Gem$mn_mi(GI$0N2qF+fAQu5XKZNM!
zGU~}WDaF)553vOxSw?Dc(L3z}gipwmQBTfEDO~9{<?aDJSM;_RDBZ~(;?+q;Jvk?(
zbO)5~fZmtU+o7QNI{Ae}b-<{U!83?J5P{+d@V$FEj6njjU={4&KTJ&8mYa!0euX8a
zWy{Jd6h&c*rhIYej59ryiD$BIe>{DXmk!1=XA<7~u;bXd_jux{WloRjcMdqIOx#MD
zY2j=cCRI6g%t{%~bib2L8Ls%9A!$%mRZ++En%UX*NJlu-*4c3`9GdOwiiATQ?OV^C
zQ<ds<t@jL`o|rj1`|!C(o|pYXZ*G9ff_}bG*hzc5btvgfOp1EXUg2{|UJxdnSj;=b
z;Pc9r!3$UVit8BvvCia~1>booL_d-LovER8%63xYq~~o{APy<IZI}U<hQnQOm~t}=
zMq68FIMmu2;e*lLO#{=})%`yWMo~v?cWOWeyky2l-8CO|s>*J^clVybF<sBP=3&#0
zXRIkR7SEXOm|p1;{05Uw>R{TO5>Ts3#_eR5Dw%z@J!G9V-D63|&Q7G<k&O9JhHFl^
zPC7Fnlp_}Ta^0AI2lqcPnX#Odd(=$3BngKXwN<M_cW?MYd-py4^UEvsDs_dr^4OH+
zS_#`cGHn@|Gsoj@(o7jvYWxJ@H<hs-u*|etg%j#<Dn4NfFIL0q6un6q$tJ7^vsPwi
zzN%7RUf;mfQwBBlKF8o*hJIO5g{859v6I{~kT&UQD2h<8Yi8=OOU>2eUd4@@RCV}E
zlt{X(O4ltBl^wC;3DedMRl0R_s_fX5?32FFQ}%3A*$Ab<1U1>M!my|A-O1P}|MLF0
zn>jFPrl?g{>GQnrtby%Pl;hmmXBei@6m8+41`r)>ibiCKNLYbd2tzL%h7*t^R8PZG
z@HCu<XW&Kn4qSpCz`O7PT!oL}XYdR76n+h#!EfM?@MriZ{0jjQZ^KnsjWyVa&Deq)
za3i*3H*Ul2xC3|LKD-a_#{)Qmr!j#^G;ji^a0bufEIx+M;y3UDeiL88SMfExgqQJc
zd|$S$#GfVh%Ci5&pU~U&fax7)mW(b2%>Ek#%uh5vB6qB~bLFbFcW>C-*0rNAH<|_G
zmj|(hs1i_;SP>ZVF(PqWT~k}f>eV%iq5TM<&U{GW(9tXAMeRn`q!3zkMHJO$)}j)m
zb(Gd-7FHC!MB28nR+Uh!mr7fQx>X^(>dU0HN8PRxqV+Oq?Ny@+0aq`dudAu8?jKN9
z9e9GNiy&|b{JjPj;cdd+FX0*??sxDf!rb2pZxvW2@U|N3aSg5~%x%O?xEZ%%7xobD
zIO^`j-8hJcaR`s#QNmuFuxFr&<2Z>AqKg^A;Foa@zk*-I$MHOVjga_t!s55^^#b$_
z`nZ!4FI^9P1^iF}4-hMi6IG|d%wvbEJ~nR@Raei>cDA;MLxLN6+Iom5B5hqo<nWx=
zTR#U2nVK_sK`$3j!yA4tKh06YZH#Hl_YCf--n;Mr&O%Nm$+TnJ2OZbqEO0zW7)7R$
z|B6B?YZ~rq-W2KFwlg-*Q<kk{)r%!nK7}0eD1>`yby|Hsg^1r;t2Oc*%gb@`6>Y7y
zPEi)7yZm^Iwvi{E$bG5V%(n1k6H|b6ZDSog8|Bm>Rb8xyr=UMYNaYUJEAl@#^GJ1{
zHo&KWf@w(F)@r-+C`3X6AeYJcG|~VO=!c^)3X`-fdJ-<cx8a9y8Qy{)!TUrW^F$uM
zhtDaA{|bMjL@vQnT!!UXNz_q`JefCO8&O9GMk#go;(j8I!*~);;V6y?+DM7you;&X
zl+yMwd`ghV^Y{Y3PSo)xeh<HoSMVx+h#%oReujU*&t;o)RI=aigOvCelEpCgiRAoJ
zFpk~4F#g1+o9ddz=1p5VdU^+P!(K4bd0<?Kz@-VNwr+KO!y2YZVkul)`5B{E$i)h;
zeOHsF=^SdxJTI<q(U^{65ygw)7P6Ljwn!`4O2xW~+d8*0O<yKfNW8dhJ7ap8SVQsR
z&L~;S#ZrkEW4p*ogmZ`p#~<rryb|dwh1QRVa(+ul{Rb^?%8}s;vA9`J6w`$}u@7T{
zU=H9RqL*PJn9&7G8%JRDEIy2n2!eS6pTwt$V4lI}2&XUM%S18X#W(PxK<r!iV|)kS
z#rN>1_;cCb99H-F?Uxd-ku1P!0qx{F1f^Rg0;RGeA%6b9Yw`d8Z<X!_{RkouMBq9R
zz_Qp-te*`2s=TlcKOYOxeVT55@p^Mo3Kx2a<9L$dIGzgp*i0M^kj^_45Pnh8bN@Lh
Z>5Z-%^JhS?|2K`Lmt(>H|JL3A{~H>MYmfi{

diff --git a/docsource/figures/.DS_Store b/docsource/figures/.DS_Store
deleted file mode 100755
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3
zem<@ulZcFPQ@L2!n>{z**<q8>++&mCkOWA81W14cNZ<zv;LbK1Poaz?KmsK2CSc!(
z0ynLxE!0092;Krf2c+FF_Fe*7ECH>lEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ
zLs35+`xjp>T0<F0fCPF1$Cyrb|F7^5{eNG?83~ZUUlGt@xh*qZDeu<Z%US-OSsOPv
j)R!Z4KLME7ReXlK;d!wEw5GODWMKRea10D2@KpjYNUI8I

diff --git a/pisces/.DS_Store b/pisces/.DS_Store
deleted file mode 100755
index 26a432d6babc6f86d9891b80e76c85f719043dd6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10244
zcmeI1TWl0n7{||73Y}qR+9{Mvp(QJgQlyyejdD}A+m<`hw51nr+u5Cw4o+v9o#_@>
zt3?x{M2U(I3K|6g4UqUEg1mTv2n1i$CWsMV4C0f%XhdK1J7;Fw?UW{HY68hjGV`6=
zcjlbm{%5{9X9*z?=~c=J2@*m$JgJogyxifS_4FKdWVt0GT0nmi>$7@dx*5a#AER3P
z1wH|vfKR|D;1lo(+zSLSX48WUIM%=V1bhNMfgu9){Sd^H+OTT}9LruEcwtKbwAon9
z4opilfi~#cuxke#i#2$F<BHaC1$T)7jyskI-F9Ku4mj3vC*UqVz`ZlLD->XN$Nqz?
zIf1Za{i{#FCoq}-UAt!!g~-Gva_0Iy)}w2d9SU8CtO?l@C*^SbWIS>wOZmCmn!C)t
zW?Qy<t%#i1!g89T#@uwHY<H#8EvBgtrR7+wruIdpf;!W%WzA3%8LnH~G($1_B4)3l
zSX-N&{@OIf?uts&8WU;~T^nUXZPhF-rs<l!o5zFWqtf(LDpVG%sw(d<3#MvnXu5Jy
zf4_8|<7UsVuHDel)4%`dyT^{7c<=qsE;t0heT$H(9wHYrap+dE(b>+(I-SPx-7t=T
zAm%o#Tvf1Ih)PrF@HY0?nrT?AYQjPrmGUg)v?gvE?(m~heq7e$y}E3xwYuJ{?NP0^
zxT*Jc8y0OHPnf#iXj&TWrlZ|fciFVfsH*nOmYJ~Ym}9$!{X4CsUQ2CKv2!h{*5fTK
ziSXRXyqU!hJhUKGz2vb$DSyh;f@y-V8Q~hX_JpP;HuX`$9kLZy4Mj6{v?VECU2QSs
zZZ#juc%FF`&Ju;K3ii3dRH&aKo#C9CIkQA@3$3Y3sCZR4jycRN5rq~D2QSHP!+E8G
z(DEey#**cBTDDLSH|cUr)g^^@+{!Biu`TKFlh(O~i>3);d)(~q#z9+d+0tdAxSig1
zy=>VVdsG95beZ0JMYuMCLmF#d9n_~*QRqNR(!YI4xYS9Ogx%;-5)L^fJcgxWQbFoT
zJLx0|k|HmW*U1s`A^C`$CFjTm@+G-OZjf8#SMnQfdfg^}LKfsfK1_kBPz0qgA0CEE
zSPV;H87zkeSPSc613V6GunnTn2{P=21lZ6ENq7dHhZkT!9E78A435JI_yEqpCvXYA
zg0JBl_#SS;FW_#iWu-N|Ha!o=!fmuek2Gr7-IB|CZ43>sT(x@ontNHp*Yh0ZPM<N8
zNqOmt$dDd9BFK~?)1M<SCcLz;n6^i#d9ZNS?4sg1C`Z(%x@#=)NICS0;8<%ExF~T<
zbJTfWDQ2_qsc>q`7K(y2foY%CQdvtAnaXK)QH>x-+3Y!>*=5VIb`q02%?9QO0zM%$
zSyzhP0LPDqQm>Lh@*`8}U*vDd0RakOF6#3U)MgkWsLYkH3L2paT3{2jqBdoSg90iT
z(1QxKP^0~*(7o_1JO?kr0eBf+fmh)$yajK=J8%R}!YMe7dOZv0;A8k4uEHSPa5Ooj
z)pZ`N{*bBFWOKsMO=Aa^QEdZh$_Plmr`7xI&>NT!@FJhb-ys~iAt)*pWd@QK)o9Sw
zq-tzTtjE`Y6S@9lgvyM1JYN#-`V>b@@P&&3X=W<5q#VOg`QoaPA&KV-=T$as-?{g|
zp~G*|c;pR7`K*)S3k^qz!2@OzzKB$e*&4Bkoq^~|8qu{hB904iA~!Z5Hj8l~ke+yD
zWldSIvNGhI_>2e3A6?Vgy?amUss5+maAr6Zh4Y4I%u<K&I%1L@0694EI6}}!4}P{-
z!w8@#jbKEa4&(+TAvX{h3F3cL$6bBDWJ-voUDfi^c}BA^m8z~NM;V1GJUZg=!>4$`
z+Sb_aeFsmSy>Ri1OP8<Q{OOj5YJrF9&zSd+^Kj*nYXUz7qexYy;>u)8?g@E?s>*^D
z)gB%^_pjx3hpBpCOdN*bsqt|2vpxZzfKTAQBQU`kaggr+H;n!N|9v+vf51KgpTOM{
zfb9C_`Utj9?<ja(>0Wyf&rUq)LE8;Dmbu_%#2*2eq1*9nR|V_by%gpb#&YUE;8^C4
jdFGBbn--&`?$kQ_G2{McK)33z|NZs<_^p2L!}b3^2i(r&

diff --git a/pisces/R/.DS_Store b/pisces/R/.DS_Store
deleted file mode 100755
index b89866d200cf60844e8d2b82f5c72bcb9520f3ce..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKJB|V|47C9XB$_F=%oVsngxC{sfnAtxkwAR<sW=x$+viUK8nzn>R?m|262~u5
zUNIhvh%PVfLZlIq8Qf5=HuTN*&3iV=hyvj_<0#*9Zcp3YX5Y(x9x(1y4&ZI&=bpdc
z*%ploPys4H1*iZO7+8TUu(SEV=khozKn31k0sB4_xM5A~0{zp0!CL^}1YtMKy_WzM
z3jk|k7l;T<g9;3)W{aUgN4#WSP3!`LE}G4U=FOTNiu&z1zj(T64dh4#sKBEF<5)IU
z|3Be>^#6|}uBZSN_*V+(WVK!`@uaM+y~kOtE$}b6<=o(Am^%f7mt&xpV=Sy3e|S>l
a6`NzfCU${NN8IT^{tTEdG%E0F1%3g&>lKUu

diff --git a/pisces/scripts/.DS_Store b/pisces/scripts/.DS_Store
deleted file mode 100755
index 9533b8ae2b6f4247cc7b31aa98079224189c678c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKJx{|h5PgOYl~^h!#{2=OU|^1_upt)245d_1sg#fpB)0r?-t!qmM21dPh0fCX
zo$q|-^Q&sd0K%+&ItL~I#uQ<YGSFlmbRDG0!Vq)3V}X0TVu|@cqJ1=}_G3J=;)a#~
zuD?c$iuM|3DDhb~kJoM8RGfc_J6f0Y)|gUWu%f=^zUR20rJ(PFvc?-D#m0u&ryOm_
zPl;2?ig62il%YS8y`sNljJ(^|ca!SdYCrq<V|Y!@fHU9>I0HLjfOodq_{h+EXTTY7
z27VaO`H)eBdBV;x9v$rB6M#6PIX3EgODHBwm?!KE>0t{NO0>|Bo*2QxY0uKUJYi>O
z;RtbM=1D7mKVCvyo%SqFN5~DmcLtn+J_Cm~oap|);$LR)k>3yTku%^7{4)l^sJJa=
z+*G<-|7=fpZAN*ZsA*m&g^haeBY*>)BPY>m{!BXO<q11OWsyCG6a7OV65^dRumc9Z
E0BI{emH+?%


From 14de666b8364b0e84b578c723e9647b2c08beba1 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Thu, 4 Mar 2021 14:13:45 -0500
Subject: [PATCH 16/52] Separate package installation from R/binary dependency
 management

---
 README.md          |  2 +-
 pisces/__init__.py | 37 +++++++++++++++++++++++++++++++++++-
 pisces/cli.py      | 17 -----------------
 setup.py           | 47 +---------------------------------------------
 4 files changed, 38 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index 5e2b8f9..6f64e15 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ Matthew D Shirley, Viveksagar K Radhakrishna, Javad Golji, Joshua M Korn. (Prepr
 Installation:
 ```
 $ pip install --user novartis-pisces
-$ pisces dependencies
+$ pisces_dependencies
 ```
 
 Submitting jobs to an HPC cluster:
diff --git a/pisces/__init__.py b/pisces/__init__.py
index 6d7c300..8d62550 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -22,11 +22,46 @@
 
 unique_id = ''.join(random.choice(string.digits) for _ in range(10))    
 
-
 def find_data_directory():
     """ Returns the path to the module directory """
     return os.path.dirname(os.path.abspath(__file__))
 
+def install_salmon():
+    """Install a version of salmon from bioconda"""
+    import glob
+    import requests
+    from io import BytesIO
+    from urllib.request import urlopen
+    from shutil import rmtree
+    from subprocess import call
+    
+    redist = os.path.join(find_data_directory(), 'redist')
+    rmtree(os.path.join(redist, "salmon"), ignore_errors=True)
+   
+    if platform.system() == "Linux":
+        salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2"
+    elif platform.system() == "Darwin":
+        salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/osx-64/salmon-1.3.0-hb70dc8d_0.tar.bz2"
+
+    print("Installing salmon")
+    with requests.get(salmon_url, allow_redirects=True) as response:
+        tar_file = BytesIO(response.content)
+        tar_file.seek(0)
+        with tarfile.open(fileobj=tar_file) as tar:
+            tar.extractall(path=os.path.join(redist, "salmon"))
+            
+def install_r_dependencies():
+    """Install R dependencies"""
+    cmd = [
+        'Rscript',
+        os.path.join(find_data_directory(), 'R/set_up_dependencies.R')
+    ]
+    call(cmd)
+    
+def install_dependencies():
+    install_salmon()
+    install_r_dependencies()
+
 def sra_valid_accession(accession):
     """ Test whether a string is an SRA accession """
     if accession.startswith('SRR') and len(accession) == 10:
diff --git a/pisces/cli.py b/pisces/cli.py
index 3dabd81..518bdb0 100755
--- a/pisces/cli.py
+++ b/pisces/cli.py
@@ -59,16 +59,6 @@ def call_Rscript(args, unknown_args):
         ]
         cmd.extend(unknown_args)
         call(cmd)
-        
-def call_Rscript_dependencies(args, unknown_args):
-    """ Call an R script, passing through arguments from argparse. """
-    cmd = [
-        'Rscript',
-        os.path.join(data_dir, 'R/set_up_dependencies.R')
-    ]
-    cmd.extend(unknown_args)
-    call(cmd)
-
 
 def default_species_index(conf):
     """ Return a list of the default index builds for each species defined in args.conf """
@@ -249,13 +239,6 @@ def create_parser(args=None):
         "Compile an expression matrix from multiple individual PISCES runs",
         add_help=False)
     matrix.set_defaults(func=call_Rscript)
-    
-    dependencies = subparsers.add_parser(
-        'dependencies',
-        help=
-        "Install R dependencies for PISCES",
-        add_help=False)
-    dependencies.set_defaults(func=call_Rscript_dependencies)
 
     qc = subparsers.add_parser(
         'summarize-qc',
diff --git a/setup.py b/setup.py
index 79a07dc..b5d3bdd 100644
--- a/setup.py
+++ b/setup.py
@@ -6,58 +6,13 @@
 import logging
 import platform
 from setuptools import setup
-from setuptools.command.develop import develop
-from setuptools.command.install import install
 
 install_requires = [
     'six', 'setuptools >= 0.7', 'simplesam >= 0.0.3', 'tqdm', 'fastqp >= 0.2',
     'pandas', 'strandex', 'gffutils', 'pyfaidx', 'drmaa', 'twobitreader', 'requests'
 ]
 
-
-class PostDevelopCommand(develop):
-    def run(self):
-        develop.run(self)
-        install_binary_dependencies()
-
-
-class PostInstallCommand(install):
-    def run(self):
-        install.run(self)
-        install_binary_dependencies()
-
-
-def install_binary_dependencies():
-    sys.path.pop(0)  # ignore local search path
-    import pisces
-    import glob
-    import requests
-    from io import BytesIO
-    from urllib.request import urlopen
-    from shutil import rmtree
-    from subprocess import call
-    
-    redist = os.path.join(os.path.dirname(pisces.__file__), 'redist')
-    rmtree(os.path.join(redist, "salmon"), ignore_errors=True)
-    local_redist = os.path.join(os.path.dirname(__file__), 'pisces/redist')
-   
-    if platform.system() == "Linux":
-        salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2"
-    elif platform.system() == "Darwin":
-        salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/osx-64/salmon-1.3.0-hb70dc8d_0.tar.bz2"
-
-    print("Installing salmon")
-    with requests.get(salmon_url, allow_redirects=True) as response:
-        tar_file = BytesIO(response.content)
-        tar_file.seek(0)
-        with tarfile.open(fileobj=tar_file) as tar:
-            tar.extractall(path=os.path.join(redist, "salmon"))
-
 setup(
-    cmdclass={
-        'develop': PostDevelopCommand,
-        'install': PostInstallCommand,
-    },
     name='novartis-pisces',
     author='Matthew Shirley',
     author_email='matt_d.shirley@novartis.com',
@@ -70,7 +25,7 @@ def install_binary_dependencies():
     install_requires=install_requires,
     use_scm_version={"local_scheme": "no-local-version"},
     setup_requires=['setuptools_scm'],
-    entry_points={'console_scripts': ['pisces = pisces.cli:main']},
+    entry_points={'console_scripts': ['pisces = pisces.cli:main', 'pisces_dependencies = pisces:install_dependencies']},
     classifiers=[
         "Development Status :: 4 - Beta",
         "License :: OSI Approved :: Apache Software License",

From e0177a7571ad99e84470d807d4dc6e8b02e102d3 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Thu, 4 Mar 2021 14:35:14 -0500
Subject: [PATCH 17/52] missing import

---
 pisces/__init__.py | 1 +
 setup.py           | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index 8d62550..f0b38e3 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -11,6 +11,7 @@
 import string
 import random
 import shutil
+import platform
 from pprint import pformat
 from subprocess import Popen, PIPE
 from multiprocessing import Process
diff --git a/setup.py b/setup.py
index b5d3bdd..53238d5 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,6 @@
 import zipfile
 import subprocess
 import logging
-import platform
 from setuptools import setup
 
 install_requires = [

From dea7a0b9419e14a09efe3634bb2650146ef6a6ba Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Thu, 4 Mar 2021 19:35:56 -0500
Subject: [PATCH 18/52] Move imports

---
 pisces/__init__.py | 1 +
 setup.py           | 6 ------
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index f0b38e3..6ce91e4 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -12,6 +12,7 @@
 import random
 import shutil
 import platform
+import tarfile
 from pprint import pformat
 from subprocess import Popen, PIPE
 from multiprocessing import Process
diff --git a/setup.py b/setup.py
index 53238d5..32e0329 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,3 @@
-import os
-import sys
-import tarfile
-import zipfile
-import subprocess
-import logging
 from setuptools import setup
 
 install_requires = [

From 47335dd8a43d876c287a0a07780386f0c68cf95d Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Thu, 4 Mar 2021 19:51:37 -0500
Subject: [PATCH 19/52] import call

---
 pisces/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index 6ce91e4..2858ec3 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -14,7 +14,7 @@
 import platform
 import tarfile
 from pprint import pformat
-from subprocess import Popen, PIPE
+from subprocess import Popen, PIPE, call
 from multiprocessing import Process
 from tempfile import NamedTemporaryFile, mkdtemp
 from functools import partial

From 0e44850d0931a66c9f9f3cc0191723b43d80bd8d Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 24 Mar 2021 10:49:22 -0400
Subject: [PATCH 20/52] Negate filtering

---
 pisces/R/make_expression_matrix.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index 369b302..c071679 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -389,7 +389,7 @@ deseq_analysis <- function(contrasts, txi.gene, contrast.metadata, formula, fail
     } else {
     deseq.dataset <- DESeqDataSetFromTximport(txi.gene, contrast.metadata, as.formula(paste0("~", contrasts$Factor[1])))}
   message(paste("Filtering", length(failing.quartile.filter), "genes failing --quartile-expression cutoff from DESeq2 dataset."))
-  deseq.dataset <- deseq.dataset[failing.quartile.filter,]
+  deseq.dataset <- deseq.dataset[-failing.quartile.filter,]
   message("Running DESeq2...")
   
   deseq.dataset <- DESeq(deseq.dataset)

From f321fc1456cca931ff1f01c9bd1a8a92d16b9e14 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 24 Mar 2021 10:54:48 -0400
Subject: [PATCH 21/52] Better document first_quartile

---
 pisces/R/make_expression_matrix.R | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index c071679..e4155fb 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -269,7 +269,14 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
       "genes."))
     ribo.genes <- which(grepl("^RP[SL]", scaled_df[, "symbol"], ignore.case = T))
     
-    first_quartile <- function(x) { y <- quantile(x, c(0.25, 0.5, 0.75), type=1)
+
+
+    first_quartile <- function(x) { 
+        # > seq(10)
+        # [1]  1  2  3  4  5  6  7  8  9 10
+        # > first_quartile(seq(10))
+        #[1] 3
+        y <- quantile(x, c(0.25, 0.5, 0.75), type=1) 
         return(y[[1]])
         }
     

From bfc6805d9e66a03d9d0ea82ab15695b43756c342 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 24 Mar 2021 11:01:40 -0400
Subject: [PATCH 22/52] Take the top quantile instead of bottom

---
 pisces/R/make_expression_matrix.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index e4155fb..b7dda00 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -275,9 +275,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         # > seq(10)
         # [1]  1  2  3  4  5  6  7  8  9 10
         # > first_quartile(seq(10))
-        #[1] 3
+        #[1] 8
         y <- quantile(x, c(0.25, 0.5, 0.75), type=1) 
-        return(y[[1]])
+        return(y[[3]])
         }
     
     message(paste("Excluding", length(ribo.genes), "ribosomal genes from TMM scaling factor calulation."))

From ab6760a1e72604a1dca873375bac81646c772a08 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 24 Mar 2021 11:09:29 -0400
Subject: [PATCH 23/52] function doc

---
 pisces/R/make_expression_matrix.R | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index b7dda00..4f74ca4 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -272,10 +272,8 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
 
 
     first_quartile <- function(x) { 
-        # > seq(10)
-        # [1]  1  2  3  4  5  6  7  8  9 10
-        # > first_quartile(seq(10))
-        #[1] 8
+        # > first_quartile(c(0,0,0,0,0,0,0,1,10,10))
+        # [1] 1
         y <- quantile(x, c(0.25, 0.5, 0.75), type=1) 
         return(y[[3]])
         }

From 89aadee0b196c569729358fc7b8300efbab6d4c5 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Thu, 15 Apr 2021 10:50:55 -0400
Subject: [PATCH 24/52] Fix exon ordering for minus strand transcripts in #15

---
 pisces/index.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pisces/index.py b/pisces/index.py
index 62184b6..5d2dc0f 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -266,10 +266,16 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                         """ 
                         """
                         sequences = []
+                        feature_strand = "."
                         for feature in features:
+                            feature_strand = feature.strand
                             sequences.append(
                                 feature.sequence(
                                     fasta_in, use_strand=strand))
+                        # if the transcript is on the reverse strand, reverse order of exons 
+                        # before concatenating
+                        if feature_strand == "-":
+                            sequences = sequences[::-1]
                         seq = ''.join(sequences)
                         mask_count = sum(seq.count(a) for a in soft_chars)
                         if masked:

From f0a0f0f3a7c07e767cd3b1006ae2fa5c7116c17d Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Fri, 9 Jul 2021 14:44:43 -0400
Subject: [PATCH 25/52] Enable selective alignment and gcBias flags

---
 pisces/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index 2858ec3..b41e8d2 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -271,7 +271,7 @@ def format_salmon_command(libtype, threads, index, output_dir, read1, read2,
                          'bin', 'salmon'), '--no-version-check', 'quant', '-q',
             '--index', index, '--libType', libtype, '--mates1', ' '.join(read1),
             '--mates2', ' '.join(read2), '--output', output_dir, '--threads',
-            str(threads), '--seqBias', '--useVBOpt'
+            str(threads), '--seqBias', '--gcBias', '--validateMappings', '--dumpEq'
         ]
     elif not read2:
         cmd = [
@@ -279,7 +279,7 @@ def format_salmon_command(libtype, threads, index, output_dir, read1, read2,
                          'bin', 'salmon'), '--no-version-check', 'quant', '-q',
             '--index', index, '--libType', libtype, '-r', ' '.join(read1), '--output',
             output_dir, '--threads',
-            str(threads), '--seqBias', '--useVBOpt'
+            str(threads), '--seqBias', '--gcBias', '--validateMappings', '--dumpEq'
         ]
     if make_bam:
         import shlex

From 4b62d5f74e7d1c21d7bf6ef51df34ae446b61c5f Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 1 Sep 2021 21:32:15 -0400
Subject: [PATCH 26/52] Add gene-level effective lengths (#6)

---
 pisces/R/make_expression_matrix.R | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index 4f74ca4..6067823 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -239,11 +239,17 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
     message("Writing annotation data matrix...")
     write.table(as.data.frame(annotation), paste0(args[["--name"]], ".", species, 
       ".annotation.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
+      
+    message("Making gene-level length matrix...")  
+    as.data.frame(txi.gene$abundance) %>% tibble::rownames_to_column("gene_id") %>% 
+      left_join(annotation) %>% as.data.frame %>%
+      write.table(format(., nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], 
+      ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, 
+      col.names = T)
     
-    message("Making raw TPM matrix...")
+    message("Making gene-level TPM matrix...")
     raw_df <- as.data.frame(txi.gene$abundance) %>% tibble::rownames_to_column("gene_id") %>% 
       left_join(annotation) %>% as.data.frame
-    
    
     if (!args[["--no-rescale"]]) {
       message(paste("Scaling TPM of", args[["--scale-tpm"]], "to 1e6..."))
@@ -263,6 +269,7 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
     write.table(format(normed_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], 
       ".", species, ".log2fc.txt"), quote = FALSE, sep = "\t", row.names = F, 
       col.names = T)
+
     
     
     message(paste("Calculating TMM scaling factors using", args[["--scale-tpm"]], 

From ea86b2c1993cb02b1647a98ee9745483614579e9 Mon Sep 17 00:00:00 2001
From: Peter Skewes-Cox <ucpete@gmail.com>
Date: Thu, 2 Sep 2021 18:16:10 -0700
Subject: [PATCH 27/52] Fixes auto sample naming

Proposed fix for #19 - could use some testing / review.
---
 pisces/run.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pisces/run.py b/pisces/run.py
index 2adc886..2ba3d2f 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -62,12 +62,15 @@ def run(args, unknown_args):
     # set up the default sample name
     if not args.name:
         if args.fq2:
-            sample_name = long_substr(tuple(chain(args.fq1, args.fq2)))
+            sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq2.split())))
         else:
             sample_name = long_substr(tuple(chain(args.fq1, args.fq1)))
         sample_name = sample_name.split('/')[-1].split('_')[0]
         logging.info("Sample name: %s", sample_name)
-        assert len(sample_name) > 0
+        try:
+            assert len(sample_name) > 0
+        except AssertionError:
+            raise RuntimeError("Output file naming failed; provide an output file basename with `--name`.")
     else:
         sample_name = args.name
 

From 63ac08d7764a68f70abe63ce62ab01fb7b13b027 Mon Sep 17 00:00:00 2001
From: Peter Skewes-Cox <ucpete@gmail.com>
Date: Tue, 7 Sep 2021 13:51:59 -0700
Subject: [PATCH 28/52] Added split for single read as well

---
 pisces/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pisces/run.py b/pisces/run.py
index 2ba3d2f..3736a2a 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -64,7 +64,7 @@ def run(args, unknown_args):
         if args.fq2:
             sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq2.split())))
         else:
-            sample_name = long_substr(tuple(chain(args.fq1, args.fq1)))
+            sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq1.split())))
         sample_name = sample_name.split('/')[-1].split('_')[0]
         logging.info("Sample name: %s", sample_name)
         try:

From f0c4508d4fba3c93d202f4c49ead7c49c1843945 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 8 Sep 2021 11:36:07 -0400
Subject: [PATCH 29/52] Fix gene-level lengths output

---
 pisces/R/make_expression_matrix.R | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index 6067823..27b5ba6 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -241,10 +241,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
       ".annotation.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
       
     message("Making gene-level length matrix...")  
-    as.data.frame(txi.gene$abundance) %>% tibble::rownames_to_column("gene_id") %>% 
-      left_join(annotation) %>% as.data.frame %>%
-      write.table(format(., nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], 
-      ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, 
+    as.data.frame(txi.gene$length) %>% tibble::rownames_to_column("gene_id") %>% 
+      left_join(annotation) %>% as.data.frame %>% format(nsmall = 3, scientific = F, trim = T) %>%
+      write.table(paste0(args[["--name"]], ".", species, ".length.txt"), quote = FALSE, sep = "\t", row.names = F, 
       col.names = T)
     
     message("Making gene-level TPM matrix...")

From 9e646770d4ff670cda567b67ecd1efc596b653c2 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 2 Nov 2021 11:05:05 -0400
Subject: [PATCH 30/52] Don't split fq path lists

args.fq1 and args.fq2 are lists and not strings
---
 pisces/run.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pisces/run.py b/pisces/run.py
index 3736a2a..e3ce4f2 100644
--- a/pisces/run.py
+++ b/pisces/run.py
@@ -62,9 +62,9 @@ def run(args, unknown_args):
     # set up the default sample name
     if not args.name:
         if args.fq2:
-            sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq2.split())))
+            sample_name = long_substr(tuple(chain(args.fq1, args.fq2)))
         else:
-            sample_name = long_substr(tuple(chain(args.fq1.split(), args.fq1.split())))
+            sample_name = long_substr(tuple(chain(args.fq1, args.fq1)))
         sample_name = sample_name.split('/')[-1].split('_')[0]
         logging.info("Sample name: %s", sample_name)
         try:

From 8d00f98f69667ab452933bfc53e74ae7504ab69e Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 3 Nov 2021 14:31:02 -0400
Subject: [PATCH 31/52] Don't drop dimensions for matrices with only one sample
 (#24)

---
 pisces/R/make_expression_matrix.R | 44 ++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index 27b5ba6..abfb26f 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -126,11 +126,17 @@ NormalizeSamples <- function(abundances, metadata, args) {
           experimentals)] + 1) - log2(apply(abundances[, c(controls)] + 1, 
           1, median))
       }
+    } else if (ncol(abundances) == 1) {
+    # only one sample
+    abundances <- log2(abundances + 1) - log2(abundances + 1)
     } else {
       # normalize to median of all samples
       message("Normalizing to median of all samples.")
       abundances <- log2(abundances + 1) - log2(apply(abundances + 1, 1, median))
     }
+  } else if (ncol(abundances) == 1) {
+    # only one sample
+    abundances <- log2(abundances + 1) - log2(abundances + 1)
   } else {
     # no metadata file
     message("Normalizing to median of all samples.")
@@ -139,10 +145,10 @@ NormalizeSamples <- function(abundances, metadata, args) {
   return(abundances)
 }
 
-rescaleTPM <- function(df, columns = colnames(df), rows = 1:nrow(df), precision = 3) {
-    sums.before.norm <- apply(df[rows, columns], 2, sum)
+rescaleTPM <- function(mat, columns = colnames(mat), rows = 1:nrow(mat), precision = 3) {
+    sums.before.norm <- apply(mat[rows, columns, drop=F], 2, sum)
     tpm.scale.factors <- 1e+06/sums.before.norm
-    scaled_df <- df
+    scaled_df <- mat
     scaled_df[rows, columns] <- data.frame(mapply("*", scaled_df[rows, 
         columns], tpm.scale.factors))
     scaled_df[, columns] <- round(scaled_df[, columns], precision)
@@ -221,9 +227,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         write.table(paste0(args[["--name"]], ".", species, ".intergenes.length.txt"), quote = FALSE, sep = "\t", row.names = F, col.names = T)
         
     message("Building transcript QC measure table...")
-    pct_intronic <- colSums(txi$counts[intron.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100
-    pct_intergenic <- colSums(txi$counts[intergene.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100
-    pct_genic <- colSums(txi$counts[tx.idx,]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),]) * 100
+    pct_intronic <- colSums(txi$counts[intron.idx,,drop=F]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),,drop=F]) * 100
+    pct_intergenic <- colSums(txi$counts[intergene.idx,,drop=F]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),,drop=F]) * 100
+    pct_genic <- colSums(txi$counts[tx.idx,,drop=F]) / colSums(txi$counts[c(tx.idx, intron.idx, intergene.idx),,drop=F]) * 100
     
     rbind(pct_genic, pct_intronic, pct_intergenic) -> qc_table
     rownames(qc_table) <- c("exonic", "intronic", "intergenic")
@@ -261,9 +267,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
       ".", species, ".TPM.txt"), quote = FALSE, sep = "\t", row.names = F, 
       col.names = T)
     normed_df <- scaled_df
-    normed_df[, abundance.cols] <- NormalizeSamples(normed_df[, abundance.cols], 
+    normed_df[, abundance.cols] <- NormalizeSamples(normed_df[, abundance.cols, drop=F], 
       metadata, args)
-    normed_df[, abundance.cols] <- round(normed_df[, abundance.cols], 3)
+    normed_df[, abundance.cols] <- round(normed_df[, abundance.cols, drop=F], 3)
     message("Making log2 fold change matrix...")
     write.table(format(normed_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], 
       ".", species, ".log2fc.txt"), quote = FALSE, sep = "\t", row.names = F, 
@@ -285,7 +291,7 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
         }
     
     message(paste("Excluding", length(ribo.genes), "ribosomal genes from TMM scaling factor calulation."))
-    failing.quartile.filter <- which(apply(scaled_df[, abundance.cols], 1, first_quartile) < 
+    failing.quartile.filter <- which(apply(scaled_df[, abundance.cols, drop=F], 1, first_quartile) < 
       as.numeric(args[["quartile-expression"]]))
     message(paste(length(failing.quartile.filter), "genes failing --quartile-expression cutoff"))
     if (is.character(args[["--exclude-genes"]])) {
@@ -309,26 +315,26 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) {
     tmm.genes <- setdiff(tmm.genes, exclusion.list)
     
     
-    sums.before.norm <- apply(raw_df[tmm.genes, abundance.cols], 2, sum)
+    sums.before.norm <- apply(raw_df[tmm.genes, abundance.cols, drop=F], 2, sum)
     tpm.scale.factors <- 1e+06/sums.before.norm
     scaled_df <- raw_df
     if (!args[["--no-rescale"]]) {
       scaled_df[tmm.genes, abundance.cols] <- data.frame(mapply("*", scaled_df[tmm.genes, 
-        abundance.cols], tpm.scale.factors))
+        abundance.cols, drop=F], tpm.scale.factors))
     }
     message(paste("Using", length(tmm.genes), "genes for TMM factor calculation"))
-    tmm.factors <- calcNormFactors(scaled_df[tmm.genes, abundance.cols])
+    tmm.factors <- calcNormFactors(scaled_df[tmm.genes, abundance.cols, drop=F])
     tmm_df <- scaled_df
     tmm_df[to.rescale, abundance.cols] <- data.frame(mapply("/", tmm_df[to.rescale, 
-      abundance.cols], tmm.factors))
-    tmm_df[, abundance.cols] <- round(tmm_df[, abundance.cols], 3)
+      abundance.cols, drop=F], tmm.factors))
+    tmm_df[, abundance.cols] <- round(tmm_df[, abundance.cols, drop=F], 3)
     write.table(format(tmm_df, nsmall = 3, scientific = F, trim = T), paste0(args[["--name"]], 
       ".", species, ".TPM.TMM-scaled.txt"), quote = FALSE, sep = "\t", row.names = F, 
       col.names = T)
     normed_tmm_df <- tmm_df
-    normed_tmm_df[, abundance.cols] <- NormalizeSamples(normed_tmm_df[, abundance.cols], 
+    normed_tmm_df[, abundance.cols] <- NormalizeSamples(normed_tmm_df[, abundance.cols, drop=F], 
       metadata, args)
-    normed_tmm_df[, abundance.cols] <- round(normed_tmm_df[, abundance.cols], 
+    normed_tmm_df[, abundance.cols] <- round(normed_tmm_df[, abundance.cols, drop=F], 
       3)
     write.table(format(normed_tmm_df, nsmall = 3, scientific = F, trim = T), 
       paste0(args[["--name"]], ".", species, ".log2fc.TMM-scaled.txt"), quote = FALSE, 
@@ -531,9 +537,9 @@ min_length <- as.numeric(args[["--min-transcript-length"]])
 min_idx <- which(apply(txi.salmon.isoform$length, 1, max) > min_length)
 message(paste("Removing", length(which(apply(txi.salmon.isoform$length, 1, max) <= 
   min_length)), "transcripts less than", min_length, "bp."))
-txi.salmon.isoform$abundance <- txi.salmon.isoform$abundance[min_idx, ]
-txi.salmon.isoform$counts <- txi.salmon.isoform$counts[min_idx, ]
-txi.salmon.isoform$length <- txi.salmon.isoform$length[min_idx, ]
+txi.salmon.isoform$abundance <- txi.salmon.isoform$abundance[min_idx,,drop=F]
+txi.salmon.isoform$counts <- txi.salmon.isoform$counts[min_idx,,drop=F]
+txi.salmon.isoform$length <- txi.salmon.isoform$length[min_idx,,drop=F]
 
 for (assembly in this.config[["fastas"]]) {
   message("Reading gene and transcript annotations.")

From 051a474e9ad8845a3391c940b5aba9da7c4d7c40 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 17 Nov 2021 13:41:46 -0500
Subject: [PATCH 32/52] Allow user-defined GTF gene name and type tags

---
 pisces/config.json | 12 +++++++++---
 pisces/index.py    | 14 ++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/pisces/config.json b/pisces/config.json
index d3f9873..2e49e3f 100755
--- a/pisces/config.json
+++ b/pisces/config.json
@@ -10,7 +10,9 @@
                 "extra_fastas": [],
                 "options": {
                     "-k": 31,
-                    "infer_features": false
+                    "infer_features": false,
+                    "gtf_name_tag": "gene_name",
+                    "gtf_type_tag": "gene_type"
                 }
             }
         }
@@ -26,7 +28,9 @@
                 "extra_fastas": [],
                 "options": {
                     "-k": 31,
-                    "infer_features": false
+                    "infer_features": false,
+                    "gtf_name_tag": "gene_name",
+                    "gtf_type_tag": "gene_type"
                 }
             }
         }
@@ -44,7 +48,9 @@
                 "extra_fastas": [],
                 "options": {
                     "-k": 31,
-                    "infer_features": false
+                    "infer_features": false,
+                    "gtf_name_tag": "gene_name",
+                    "gtf_type_tag": "gene_type"
                 }
             }
         }
diff --git a/pisces/index.py b/pisces/index.py
index 5d2dc0f..ad7812b 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -320,12 +320,22 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         featuretype='transcript',
                                         order_by='start'))
                                 try:
-                                    gene_type = first_transcript['gene_type'][0]
+                                    if options["gene_type"] == True:
+                                        type_tag = "gene_type"
+                                    else:
+                                        type_tag = options["gene_type"]
+                                    gene_type = first_transcript[type_tag][0]
                                 except KeyError:
+                                    logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
                                 try:
-                                    gene_name = first_transcript['gene_name'][0]
+                                    if options["gene_name"] == True:
+                                        name_tag = "gene_name"
+                                    else:
+                                        name_tag = options["gene_name"]
+                                    gene_name = first_transcript[name_tag][0]
                                 except KeyError:
+                                    logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'
                                     
                                 exons = db.children(gene, featuretype='exon', order_by='start') 

From 74da364d7cb17fc9471db15b4c1d35788c1451b8 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 17 Nov 2021 13:53:39 -0500
Subject: [PATCH 33/52] Invert default config value

---
 pisces/index.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index ad7812b..f9bc9b9 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -43,7 +43,7 @@ def build_index(args, unknown_args):
             if args.indices and index_name not in args.indices:
                 continue
             pprint(dataset, indent=4)
-            options = defaultdict(lambda: True)
+            options = defaultdict(lambda: False)
             options.update(dataset["options"])
             index_dir_path = os.path.join(index_dir_base, species, index_name)
             if os.path.exists(index_dir_path):
@@ -320,7 +320,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         featuretype='transcript',
                                         order_by='start'))
                                 try:
-                                    if options["gene_type"] == True:
+                                    if not options["gene_type"]:
                                         type_tag = "gene_type"
                                     else:
                                         type_tag = options["gene_type"]
@@ -329,7 +329,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
                                 try:
-                                    if options["gene_name"] == True:
+                                    if not options["gene_name"]:
                                         name_tag = "gene_name"
                                     else:
                                         name_tag = options["gene_name"]

From 79a17578d201c665f1ea0c2b75c6c2850e9ba106 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 17 Nov 2021 14:28:39 -0500
Subject: [PATCH 34/52] Revert config truthiness

---
 pisces/index.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index f9bc9b9..ad7812b 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -43,7 +43,7 @@ def build_index(args, unknown_args):
             if args.indices and index_name not in args.indices:
                 continue
             pprint(dataset, indent=4)
-            options = defaultdict(lambda: False)
+            options = defaultdict(lambda: True)
             options.update(dataset["options"])
             index_dir_path = os.path.join(index_dir_base, species, index_name)
             if os.path.exists(index_dir_path):
@@ -320,7 +320,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         featuretype='transcript',
                                         order_by='start'))
                                 try:
-                                    if not options["gene_type"]:
+                                    if options["gene_type"] == True:
                                         type_tag = "gene_type"
                                     else:
                                         type_tag = options["gene_type"]
@@ -329,7 +329,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
                                 try:
-                                    if not options["gene_name"]:
+                                    if options["gene_name"] == True:
                                         name_tag = "gene_name"
                                     else:
                                         name_tag = options["gene_name"]

From ce9e0c32c1e8dfb0f5cd9cca7886e2cbe11f5d73 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 17 Nov 2021 14:43:04 -0500
Subject: [PATCH 35/52] Write annotation before merging exons and calculating
 interfeatures

---
 pisces/index.py | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index ad7812b..a60baf6 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -297,23 +297,6 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 total=db.count_features_of_type('gene'),
                                 unit='gene') as pbar:
                             for gene in db.features_of_type('gene'):
-                                transcripts = db.children(gene, featuretype='transcript', order_by='start')
-                                for transcript in transcripts:
-                                    fa_seq, frac_masked = features_to_string(db.children(transcript, 
-                                                                                   featuretype='exon', 
-                                                                                   order_by='start'), 
-                                                                       reference, 
-                                                                       masked=options["masked"])
-                                    transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
-                                    transcripts_fasta.write(fa_seq + '\n')
-                                if options["unprocessed_transcripts"]:
-                                    exons = db.children(gene, featuretype='exon', order_by='start') 
-                                    merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
-                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')                                                     
-                                    transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n')
-                                    fa_seq, _ = features_to_string(introns, reference, masked=options["masked"])
-                                    transcripts_fasta.write(fa_seq + '\n')
-                                
                                 first_transcript = next(
                                     db.children(
                                         gene,
@@ -338,8 +321,28 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'
                                     
+                                transcripts = db.children(gene, featuretype='transcript', order_by='start')
+                                for transcript in transcripts:
+                                    # Write entry in the transcripts to genes table
+                                    gene2tx.write("{txp}\t{gene}\n".format(
+                                        gene=gene['gene_id'][0],
+                                        txp=transcript['transcript_id'][0]))
+                                    # Construct the transcript sequences and write them to the FASTA
+                                    fa_seq, frac_masked = features_to_string(db.children(transcript, 
+                                                                                   featuretype='exon', 
+                                                                                   order_by='start'), 
+                                                                       reference, 
+                                                                       masked=options["masked"])
+                                    transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
+                                    transcripts_fasta.write(fa_seq + '\n')
+                                    
                                 exons = db.children(gene, featuretype='exon', order_by='start') 
                                 merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
+                                if options["unprocessed_transcripts"]:
+                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')                                                     
+                                    transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n')
+                                    fa_seq, _ = features_to_string(introns, reference, masked=options["masked"])
+                                    transcripts_fasta.write(fa_seq + '\n')
                                 
                                 annotation.write(
                                     "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n".
@@ -352,14 +355,13 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         chrom=gene.chrom,
                                         length=sum(len(exon) for exon in merged_exons),
                                         frac_masked=str(frac_masked)))
+                                        
                                 transcripts = db.children(
                                     gene,
                                     featuretype='transcript',
                                     order_by='start')
                                 for transcript in transcripts:
-                                    gene2tx.write("{txp}\t{gene}\n".format(
-                                        gene=gene['gene_id'][0],
-                                        txp=transcript['transcript_id'][0]))
+                                    
                                 pbar.update(1)
                             
                     if options["intergenes"]:

From 0218188e0d6f27a71e63fbecf7fe8ace7806d39c Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 17 Nov 2021 14:45:09 -0500
Subject: [PATCH 36/52] typo

---
 pisces/index.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index a60baf6..f1b0d6a 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -360,8 +360,6 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                     gene,
                                     featuretype='transcript',
                                     order_by='start')
-                                for transcript in transcripts:
-                                    
                                 pbar.update(1)
                             
                     if options["intergenes"]:

From 14d0c2245301680049058be667bede4fa8b380d5 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 17 Nov 2021 15:02:19 -0500
Subject: [PATCH 37/52] Grab GTF tags from exons

---
 pisces/index.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index f1b0d6a..bdcf299 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -297,17 +297,17 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 total=db.count_features_of_type('gene'),
                                 unit='gene') as pbar:
                             for gene in db.features_of_type('gene'):
-                                first_transcript = next(
+                                first_exon = next(
                                     db.children(
                                         gene,
-                                        featuretype='transcript',
+                                        featuretype='exon',
                                         order_by='start'))
                                 try:
                                     if options["gene_type"] == True:
                                         type_tag = "gene_type"
                                     else:
                                         type_tag = options["gene_type"]
-                                    gene_type = first_transcript[type_tag][0]
+                                    gene_type = first_exon[type_tag][0]
                                 except KeyError:
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
@@ -316,7 +316,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         name_tag = "gene_name"
                                     else:
                                         name_tag = options["gene_name"]
-                                    gene_name = first_transcript[name_tag][0]
+                                    gene_name = first_exon[name_tag][0]
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'

From fb1115a7d910e365570da81639e332d3baca7527 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Wed, 30 Mar 2022 09:58:30 -0400
Subject: [PATCH 38/52] Don't don't coerce data.frame to vector when
 length(columns) == 1

This is another fix for #24
---
 pisces/R/make_expression_matrix.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R
index abfb26f..06d5c84 100755
--- a/pisces/R/make_expression_matrix.R
+++ b/pisces/R/make_expression_matrix.R
@@ -150,8 +150,8 @@ rescaleTPM <- function(mat, columns = colnames(mat), rows = 1:nrow(mat), precisi
     tpm.scale.factors <- 1e+06/sums.before.norm
     scaled_df <- mat
     scaled_df[rows, columns] <- data.frame(mapply("*", scaled_df[rows, 
-        columns], tpm.scale.factors))
-    scaled_df[, columns] <- round(scaled_df[, columns], precision)
+        columns, drop=F], tpm.scale.factors))
+    scaled_df[, columns] <- round(scaled_df[, columns, drop=F], precision)
     return(scaled_df)
     }
     

From fc09cc1de2a48147bd61e5b4e119982743373aba Mon Sep 17 00:00:00 2001
From: Matt Shirley <shirlma2@c02g91llq05q.local>
Date: Thu, 26 May 2022 14:51:06 +0000
Subject: [PATCH 39/52] Use zgrep

---
 pisces/__init__.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index b41e8d2..7c4062c 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -22,7 +22,7 @@
 
 __version__ = get_distribution("novartis_pisces").version
 
-unique_id = ''.join(random.choice(string.digits) for _ in range(10))    
+unique_id = ''.join(random.choice(string.digits) for _ in range(10))
 
 def find_data_directory():
     """ Returns the path to the module directory """
@@ -36,10 +36,10 @@ def install_salmon():
     from urllib.request import urlopen
     from shutil import rmtree
     from subprocess import call
-    
+
     redist = os.path.join(find_data_directory(), 'redist')
     rmtree(os.path.join(redist, "salmon"), ignore_errors=True)
-   
+
     if platform.system() == "Linux":
         salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2"
     elif platform.system() == "Darwin":
@@ -51,7 +51,7 @@ def install_salmon():
         tar_file.seek(0)
         with tarfile.open(fileobj=tar_file) as tar:
             tar.extractall(path=os.path.join(redist, "salmon"))
-            
+
 def install_r_dependencies():
     """Install R dependencies"""
     cmd = [
@@ -59,7 +59,7 @@ def install_r_dependencies():
         os.path.join(find_data_directory(), 'R/set_up_dependencies.R')
     ]
     call(cmd)
-    
+
 def install_dependencies():
     install_salmon()
     install_r_dependencies()
@@ -69,7 +69,7 @@ def sra_valid_accession(accession):
     if accession.startswith('SRR') and len(accession) == 10:
         return True
     return False
-    
+
 def long_substr(data):
     """ http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python """
     substr = ''
@@ -149,7 +149,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir,
                 kmers2_outname, 'w') as kmers2_out:
             p1 = Popen(
                 [
-                    'fgrep', '-h', '-o', '-f',
+                    'zgrep', '-h', '-o', '-f',
                     os.path.join(data_dir, 'data/fp_kmers_col1.txt'),
                     ' '.join(fastq_1)
                 ],
@@ -157,7 +157,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir,
                 stderr=PIPE)
             p2 = Popen(
                 [
-                    'fgrep', '-h', '-o', '-f',
+                    'zgrep', '-h', '-o', '-f',
                     os.path.join(data_dir, 'data/fp_kmers_col1.txt'),
                     ' '.join(fastq_2)
                 ],
@@ -169,7 +169,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir,
         with open(kmers1_outname, 'w') as kmers1_out:
             p1 = Popen(
                 [
-                    'fgrep', '-h', '-o', '-f',
+                    'zgrep', '-h', '-o', '-f',
                     os.path.join(data_dir, 'data/fp_kmers_col1.txt'),
                     ' '.join(fastq_1)
                 ],

From 6752dce522db8cfbdc416bb911ebaf6e05f74f42 Mon Sep 17 00:00:00 2001
From: Matt Shirley <shirlma2@c02g91llq05q.local>
Date: Thu, 26 May 2022 15:00:23 +0000
Subject: [PATCH 40/52] Update gitignore

---
 .gitignore | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitignore b/.gitignore
index 0205d62..436ac46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
 *.pyc
 .DS_Store
+.eggs
+.tox
+build
+dist
+docs
+*.egg-info
\ No newline at end of file

From 979dd9e3fb75db9bf0b0400b239f3b3491554af7 Mon Sep 17 00:00:00 2001
From: Matt Shirley <shirlma2@c02g91llq05q.local>
Date: Thu, 26 May 2022 15:01:21 +0000
Subject: [PATCH 41/52] Add Dockerfile

---
 Dockerfile | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100755 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100755
index 0000000..b73f15c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:18.04
+
+COPY . /pisces
+WORKDIR /pisces
+
+# Install R
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/'
+RUN apt update
+RUN apt install r-base
+
+# Install PISCES
+RUN python setup.py install
+RUN pisces dependencies
+
+ENTRYPOINT ["/usr/local/bin/pisces"]
\ No newline at end of file

From 420f8f064cac85cf99203f8caed2ac86f85ba59a Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Thu, 26 May 2022 15:25:37 -0400
Subject: [PATCH 42/52] Use fixed grep patterns

---
 pisces/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pisces/__init__.py b/pisces/__init__.py
index 7c4062c..35e17cd 100755
--- a/pisces/__init__.py
+++ b/pisces/__init__.py
@@ -149,7 +149,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir,
                 kmers2_outname, 'w') as kmers2_out:
             p1 = Popen(
                 [
-                    'zgrep', '-h', '-o', '-f',
+                    'zgrep', '-F', '-h', '-o', '-f',
                     os.path.join(data_dir, 'data/fp_kmers_col1.txt'),
                     ' '.join(fastq_1)
                 ],
@@ -157,7 +157,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir,
                 stderr=PIPE)
             p2 = Popen(
                 [
-                    'zgrep', '-h', '-o', '-f',
+                    'zgrep', '-F', '-h', '-o', '-f',
                     os.path.join(data_dir, 'data/fp_kmers_col1.txt'),
                     ' '.join(fastq_2)
                 ],
@@ -169,7 +169,7 @@ def fingerprint_sample(args, fastq_1, fastq_2, data_dir, output_dir,
         with open(kmers1_outname, 'w') as kmers1_out:
             p1 = Popen(
                 [
-                    'zgrep', '-h', '-o', '-f',
+                    'zgrep', '-F', '-h', '-o', '-f',
                     os.path.join(data_dir, 'data/fp_kmers_col1.txt'),
                     ' '.join(fastq_1)
                 ],

From 01c5f5578248ba13a30b71d4c1866b4481787daa Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Thu, 5 Jan 2023 00:08:26 +0000
Subject: [PATCH 43/52] Use checklines=1 to avoid inconsistent dialect warnings

---
 pisces/index.py | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index bdcf299..de27b3a 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -61,7 +61,7 @@ def build_index(args, unknown_args):
 
             transcripts_fasta_file = os.path.join(
                 index_dir_path, "transcripts", "transcripts.fa")
-                
+
             with open(transcripts_fasta_file, 'w') as transcripts_fasta:
                 ## all of this URI handling should probably use an existing library like
                 ## https://github.com/intake/filesystem_spec
@@ -104,12 +104,12 @@ def build_index(args, unknown_args):
                                         fasta.write(str(twobit[chrom]) + '\n')
                             reference = Fasta(
                                 _fasta_local_path.replace("2bit", "fa"))
-                    
+
                     with open(_fasta_local_path) as extra:
                         logging.info("Adding entries from %s", fasta)
                         for line in extra:
                             transcripts_fasta.write(line)
-                            
+
                 for gtf_loc, fasta_loc in zip(dataset["gtfs"],
                                               dataset["fastas"]):
                     gtf = urlparse(gtf_loc)
@@ -172,6 +172,7 @@ def build_index(args, unknown_args):
                                 db = gffutils.create_db(
                                     gtf.path,
                                     database_filename,
+                                    checklines=1,
                                     disable_infer_genes=
                                     not options["infer_features"],
                                     disable_infer_transcripts=
@@ -188,6 +189,7 @@ def build_index(args, unknown_args):
                                     db = gffutils.create_db(
                                         gtf.path,
                                         tmp_db,
+                                        checklines=1,
                                         disable_infer_genes=
                                         not options["infer_features"],
                                         disable_infer_transcripts=
@@ -220,6 +222,7 @@ def build_index(args, unknown_args):
                                             _gtf_local_path.replace(".gz", ""),
                                             _gtf_local_path.replace(
                                                 ".gz", "") + '.db',
+                                            checklines=1,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=
@@ -231,6 +234,7 @@ def build_index(args, unknown_args):
                                         db = gffutils.create_db(
                                             _gtf_local_path,
                                             _gtf_local_path + '.db',
+                                            checklines=1,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=
@@ -261,9 +265,9 @@ def build_index(args, unknown_args):
                     gene_annotation = os.path.join(
                         index_dir_path,
                         assembly + "_gene_annotation.txt")
-                        
+
                     def features_to_string(features, fasta_in, masked=True, strand=True):
-                        """ 
+                        """
                         """
                         sequences = []
                         feature_strand = "."
@@ -272,7 +276,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                             sequences.append(
                                 feature.sequence(
                                     fasta_in, use_strand=strand))
-                        # if the transcript is on the reverse strand, reverse order of exons 
+                        # if the transcript is on the reverse strand, reverse order of exons
                         # before concatenating
                         if feature_strand == "-":
                             sequences = sequences[::-1]
@@ -320,7 +324,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'
-                                    
+
                                 transcripts = db.children(gene, featuretype='transcript', order_by='start')
                                 for transcript in transcripts:
                                     # Write entry in the transcripts to genes table
@@ -328,22 +332,22 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         gene=gene['gene_id'][0],
                                         txp=transcript['transcript_id'][0]))
                                     # Construct the transcript sequences and write them to the FASTA
-                                    fa_seq, frac_masked = features_to_string(db.children(transcript, 
-                                                                                   featuretype='exon', 
-                                                                                   order_by='start'), 
-                                                                       reference, 
+                                    fa_seq, frac_masked = features_to_string(db.children(transcript,
+                                                                                   featuretype='exon',
+                                                                                   order_by='start'),
+                                                                       reference,
                                                                        masked=options["masked"])
                                     transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
                                     transcripts_fasta.write(fa_seq + '\n')
-                                    
-                                exons = db.children(gene, featuretype='exon', order_by='start') 
+
+                                exons = db.children(gene, featuretype='exon', order_by='start')
                                 merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
                                 if options["unprocessed_transcripts"]:
-                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')                                                     
+                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')
                                     transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n')
                                     fa_seq, _ = features_to_string(introns, reference, masked=options["masked"])
                                     transcripts_fasta.write(fa_seq + '\n')
-                                
+
                                 annotation.write(
                                     "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n".
                                     format(
@@ -355,13 +359,13 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         chrom=gene.chrom,
                                         length=sum(len(exon) for exon in merged_exons),
                                         frac_masked=str(frac_masked)))
-                                        
+
                                 transcripts = db.children(
                                     gene,
                                     featuretype='transcript',
                                     order_by='start')
                                 pbar.update(1)
-                            
+
                     if options["intergenes"]:
                         for seqid in reference.keys():
                             logging.info("Merging overlapping genes on %s", seqid)

From 49b5fb59d5cd145155e83f9f6475aefd8ebcd9ce Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Thu, 5 Jan 2023 18:06:49 +0000
Subject: [PATCH 44/52] Allow a formal GTF dialect

---
 pisces/config.json |  3 +++
 pisces/index.py    | 22 ++++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/pisces/config.json b/pisces/config.json
index 2e49e3f..733c79a 100755
--- a/pisces/config.json
+++ b/pisces/config.json
@@ -11,6 +11,7 @@
                 "options": {
                     "-k": 31,
                     "infer_features": false,
+                    "infer_dialect": true,
                     "gtf_name_tag": "gene_name",
                     "gtf_type_tag": "gene_type"
                 }
@@ -29,6 +30,7 @@
                 "options": {
                     "-k": 31,
                     "infer_features": false,
+                    "infer_dialect": true,
                     "gtf_name_tag": "gene_name",
                     "gtf_type_tag": "gene_type"
                 }
@@ -49,6 +51,7 @@
                 "options": {
                     "-k": 31,
                     "infer_features": false,
+                    "infer_dialect": true,
                     "gtf_name_tag": "gene_name",
                     "gtf_type_tag": "gene_type"
                 }
diff --git a/pisces/index.py b/pisces/index.py
index de27b3a..a57edd0 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -12,6 +12,7 @@ def build_index(args, unknown_args):
     import gffutils.merge_criteria as mc
     import atexit
     import shutil
+    import copy
     from tqdm import tqdm
     from collections import defaultdict
     from pprint import pprint
@@ -159,6 +160,19 @@ def build_index(args, unknown_args):
                         else:
                             reference = Fasta(_fasta_local_path)
 
+                    if not options["infer_dialect"]:
+                        ## set up a GTF dialect for gffutils
+                        from gffutils.constants import dialect as gff_dialect
+                        dialect = copy.copy(gff_dialect)
+                        dialect['field separator'] = '; '
+                        dialect['keyval separator'] = ' '
+                        dialect['fmt'] = 'gtf'
+                        dialect['quoted GFF2 values'] = True
+                        dialect['repeated keys'] = True
+                        dialect['trailing semicolon'] = True
+                    else:
+                        dialect = None
+
                     if gtf.scheme == '':
                         database_filename = gtf.path + '.db'
                         if os.path.exists(database_filename):
@@ -172,7 +186,7 @@ def build_index(args, unknown_args):
                                 db = gffutils.create_db(
                                     gtf.path,
                                     database_filename,
-                                    checklines=1,
+                                    dialect=dialect,
                                     disable_infer_genes=
                                     not options["infer_features"],
                                     disable_infer_transcripts=
@@ -189,7 +203,7 @@ def build_index(args, unknown_args):
                                     db = gffutils.create_db(
                                         gtf.path,
                                         tmp_db,
-                                        checklines=1,
+                                        dialect=dialect,
                                         disable_infer_genes=
                                         not options["infer_features"],
                                         disable_infer_transcripts=
@@ -222,7 +236,7 @@ def build_index(args, unknown_args):
                                             _gtf_local_path.replace(".gz", ""),
                                             _gtf_local_path.replace(
                                                 ".gz", "") + '.db',
-                                            checklines=1,
+                                            dialect=dialect,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=
@@ -234,7 +248,7 @@ def build_index(args, unknown_args):
                                         db = gffutils.create_db(
                                             _gtf_local_path,
                                             _gtf_local_path + '.db',
-                                            checklines=1,
+                                            dialect=dialect,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=

From a13cbedcfb81306aaaf7aafd53773b009e8b8a40 Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Fri, 6 Jan 2023 14:24:26 +0000
Subject: [PATCH 45/52] Pull gene tags from gene record

---
 pisces/index.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index a57edd0..22a63c8 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -162,6 +162,7 @@ def build_index(args, unknown_args):
 
                     if not options["infer_dialect"]:
                         ## set up a GTF dialect for gffutils
+                        ## this dialect works for https://github.com/daler/gffutils/issues/198
                         from gffutils.constants import dialect as gff_dialect
                         dialect = copy.copy(gff_dialect)
                         dialect['field separator'] = '; '
@@ -315,17 +316,12 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 total=db.count_features_of_type('gene'),
                                 unit='gene') as pbar:
                             for gene in db.features_of_type('gene'):
-                                first_exon = next(
-                                    db.children(
-                                        gene,
-                                        featuretype='exon',
-                                        order_by='start'))
                                 try:
                                     if options["gene_type"] == True:
                                         type_tag = "gene_type"
                                     else:
                                         type_tag = options["gene_type"]
-                                    gene_type = first_exon[type_tag][0]
+                                    gene_type = gene[type_tag][0]
                                 except KeyError:
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
@@ -334,7 +330,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         name_tag = "gene_name"
                                     else:
                                         name_tag = options["gene_name"]
-                                    gene_name = first_exon[name_tag][0]
+                                    gene_name = gene[name_tag][0]
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'

From 5cbf32e809a823b720af5dedf8f3ca3fc17c6e25 Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Fri, 6 Jan 2023 14:54:03 +0000
Subject: [PATCH 46/52] Use correct GTF tags for gene names and biotypes

---
 pisces/index.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index 22a63c8..de325d4 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -317,19 +317,19 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 unit='gene') as pbar:
                             for gene in db.features_of_type('gene'):
                                 try:
-                                    if options["gene_type"] == True:
-                                        type_tag = "gene_type"
+                                    if options["gtf_type_tag"] == True:
+                                        type_tag = "gene_biotype"
                                     else:
-                                        type_tag = options["gene_type"]
+                                        type_tag = options["gtf_type_tag"]
                                     gene_type = gene[type_tag][0]
                                 except KeyError:
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
                                 try:
-                                    if options["gene_name"] == True:
-                                        name_tag = "gene_name"
+                                    if options["gtf_name_tag"] == True:
+                                        name_tag = "gene_id"
                                     else:
-                                        name_tag = options["gene_name"]
+                                        name_tag = options["gtf_name_tag"]
                                     gene_name = gene[name_tag][0]
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])

From a8fd0b04d39844b5ff3bf7770581750de603e0a4 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 1 Aug 2023 11:11:13 -0400
Subject: [PATCH 47/52] Update main.yml

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 22a44ce..29f3ec3 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,7 +30,7 @@ jobs:
           python-version: ${{ matrix.python }}
           
       - name: Setup R
-        uses: r-lib/actions/setup-r@master
+        uses: r-lib/actions/setup-r@v2
         with:
             r-version: '3.6.2'
             

From 9936079ac75d4b75be95bad5bc962465e8c5f458 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 1 Aug 2023 14:47:09 -0400
Subject: [PATCH 48/52] Update publish-to-test-pypi.yml

---
 .github/workflows/publish-to-test-pypi.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml
index 20abfa5..2b4dc28 100644
--- a/.github/workflows/publish-to-test-pypi.yml
+++ b/.github/workflows/publish-to-test-pypi.yml
@@ -4,7 +4,7 @@ on: push
 jobs:
   build-n-publish:
     name: Build and publish packages to PyPI and TestPyPI
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     
     steps:
       - name: Checkout
@@ -34,4 +34,4 @@ jobs:
         if: startsWith(github.ref, 'refs/tags')
         uses: pypa/gh-action-pypi-publish@master
         with:
-            password: ${{ secrets.pypi_password }}
\ No newline at end of file
+            password: ${{ secrets.pypi_password }}

From 99fd110c4b55fad3298d17f5ac5ecfc4f1b6f7e0 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 28 May 2024 09:00:49 -0400
Subject: [PATCH 49/52] Update build_docs.yml

---
 .github/workflows/build_docs.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
index a9b0252..93f80c3 100644
--- a/.github/workflows/build_docs.yml
+++ b/.github/workflows/build_docs.yml
@@ -27,6 +27,9 @@ jobs:
         uses: r-lib/actions/setup-r@master
         with:
             r-version: '3.6.2'
+
+      - name: Install renv
+        run: Rscript -e "install.packages(c('renv'), repos='https://cran.rstudio.com')"
             
       - name: Install python packages
         run: pip install tox

From 8739201b818457ba7e2c9a08aa0f6f794b2778ee Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 28 May 2024 09:06:21 -0400
Subject: [PATCH 50/52] Update build_docs.yml

---
 .github/workflows/build_docs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
index 93f80c3..ee2a79e 100644
--- a/.github/workflows/build_docs.yml
+++ b/.github/workflows/build_docs.yml
@@ -24,7 +24,7 @@ jobs:
           python-version: 3.9
           
       - name: Setup R
-        uses: r-lib/actions/setup-r@master
+        uses: r-lib/actions/setup-r@v2
         with:
             r-version: '3.6.2'
 

From d6909a155d57679e8d81c9084e5f904fbdcd3ddb Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Tue, 28 May 2024 09:29:37 -0400
Subject: [PATCH 51/52] Add section describing composite transcriptomes

---
 .gitignore            |  3 ++-
 docsource/running.rst | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 436ac46..e1d07cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@
 build
 dist
 docs
-*.egg-info
\ No newline at end of file
+*.egg-info
+.vscode/settings.json
diff --git a/docsource/running.rst b/docsource/running.rst
index 59649fe..68fcc09 100644
--- a/docsource/running.rst
+++ b/docsource/running.rst
@@ -65,6 +65,16 @@ explicitly, or default to automatic values.
 
 .. command-output:: pisces run --help
 
+The (``-t, --sample-type``) flag is used to specify the species name, 
+which will select the index files ``salmon quant`` uses to quantify transcript expression from the selected 
+set of fastq files. The species name is the top level key in the :doc:`config` file. If the ``--sample-type`` 
+defines a transcriptome composed of transcripts from multiple species/assemblies (GTF/FASTA pairs), the 
+transcript expression will be quantified using a single combined index with transcript abundance jointly 
+calculated on all transcripts from the composite transcriptome. The ``pisces summarize-expression`` command will separate transcript and gene 
+expression estimantes (counts and TPM) into separate output tables for each species/assembly and re-scale TPM values as if the 
+transcript and gene expression were estimated independently. This method ensures that sequence reads which map ambiguously 
+between similar transcripts will be assigned to the most likely transcript during quantification.
+
 .. _submit_example:
 
 pisces submit

From 36ca8f16e6b763abf341d78a15844d40696e3f37 Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Tue, 28 May 2024 09:38:00 -0400
Subject: [PATCH 52/52] Update pisces_dependencies command

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index d56bf87..cc32cda 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,7 +10,7 @@ changedir = docsource
 deps = sphinx
        sphinxcontrib.bibtex
        sphinxcontrib.programoutput
-commands = pisces dependencies
+commands = pisces_dependencies
            make github
 
 [testenv:package]