change input fastq file naming convention to a more generic form

leahkemp · Nov 2, 2021 · 9657608 · 9657608
1 parent b95b92a
commit 9657608
Show file tree

Hide file tree

Showing 16 changed files with 20 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -85,10 +85,11 @@ git clone https://github.com/leahkemp/smncrna_analysis_template.git
 #### Fastq naming convention
 
 ```bash
-sample_S*_R1.fastq.gz
+sample.fastq.gz
 ```
 
 - one fastq file per sample
+- each file is named with its sample name, exactly as it appears in the metadata file, followed by the ".fastq.gz" extension
 
 For example see the test fastq files [here](./test/fastq/)
 

diff --git a/diff_expression/diff_expression.Rmd b/diff_expression/diff_expression.Rmd
@@ -56,39 +56,39 @@ mirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_re
                                         stringsAsFactors = FALSE,
                                         check.names = FALSE) %>%
   # remove the ".fastq" suffix from the sample/column names
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub(".fastq", "", .))
 
 pirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                         "exceRpt_piRNA_ReadCounts.txt"),
                                         header = TRUE,
                                         stringsAsFactors = FALSE,
                                         check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 trna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                        "exceRpt_tRNA_ReadCounts.txt"),
                                        header = TRUE,
                                        stringsAsFactors = FALSE,
                                        check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 circrna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                           "exceRpt_circularRNA_ReadCounts.txt"),
                                           header = TRUE,
                                           stringsAsFactors = FALSE,
                                           comment.char = "") %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 gencode_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                           "exceRpt_gencode_ReadCounts.txt"),
                                           header = TRUE,
                                           stringsAsFactors = FALSE,
                                           check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 mirna_smrnaseq_data <- utils::read.table(base::file.path(config$smrnaseq_results_dir,
                                                          "/edgeR/miRBase_mature/mature_counts.csv"),
@@ -100,7 +100,7 @@ mirna_smrnaseq_data <- utils::read.table(base::file.path(config$smrnaseq_results
   base::as.data.frame() %>%
   janitor::row_to_names(row_number = 1) %>%
   # remove the ".mature" suffix from the sample/column names
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[.]mature", "", .))
+  dplyr::rename_with(~ base::sub(".mature", "", .))
 
 # formatting to make counts datasets consistent between smrnaseq and excerpt pipelines
 mirna_smrnaseq_data <- mirna_smrnaseq_data %>%

diff --git a/heatmaps/heatmaps.Rmd b/heatmaps/heatmaps.Rmd
@@ -46,39 +46,39 @@ mirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_re
                                         stringsAsFactors = FALSE,
                                         check.names = FALSE) %>%
   # remove the ".fastq" suffix from the sample/column names
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub(".fastq", "", .))
 
 pirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                         "exceRpt_piRNA_ReadCounts.txt"),
                                         header = TRUE,
                                         stringsAsFactors = FALSE,
                                         check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 trna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                        "exceRpt_tRNA_ReadCounts.txt"),
                                        header = TRUE,
                                        stringsAsFactors = FALSE,
                                        check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 circrna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                           "exceRpt_circularRNA_ReadCounts.txt"),
                                           header = TRUE,
                                           stringsAsFactors = FALSE,
                                           comment.char = "") %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 gencode_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                           "exceRpt_gencode_ReadCounts.txt"),
                                           header = TRUE,
                                           stringsAsFactors = FALSE,
                                           check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 mirna_smrnaseq_data <- utils::read.table(base::file.path(config$smrnaseq_results_dir,
                                                          "/edgeR/miRBase_mature/mature_counts.csv"),
@@ -90,7 +90,7 @@ mirna_smrnaseq_data <- utils::read.table(base::file.path(config$smrnaseq_results
   base::as.data.frame() %>%
   janitor::row_to_names(row_number = 1) %>%
   # remove the ".mature" suffix from the sample/column names
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[.]mature", "", .))
+  dplyr::rename_with(~ base::sub(".mature", "", .))
 
 # formatting to make counts datasets consistent between smrnaseq and excerpt pipelines
 mirna_smrnaseq_data <- mirna_smrnaseq_data %>%

diff --git a/prepare_counts/prepare_counts.R b/prepare_counts/prepare_counts.R
@@ -16,39 +16,39 @@ mirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_re
                                         stringsAsFactors = FALSE,
                                         check.names = FALSE) %>%
   # remove the ".fastq" suffix from the sample/column names
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub(".fastq", "", .))
 
 pirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                         "exceRpt_piRNA_ReadCounts.txt"),
                                         header = TRUE,
                                         stringsAsFactors = FALSE,
                                         check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 trna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                        "exceRpt_tRNA_ReadCounts.txt"),
                                        header = TRUE,
                                        stringsAsFactors = FALSE,
                                        check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 circrna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                           "exceRpt_circularRNA_ReadCounts.txt"),
                                           header = TRUE,
                                           stringsAsFactors = FALSE,
                                           comment.char = "") %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 gencode_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                           "exceRpt_gencode_ReadCounts.txt"),
                                           header = TRUE,
                                           stringsAsFactors = FALSE,
                                           check.names = FALSE) %>%
   # remove the literal ".fastq" suffix from the sample/column names ("[.]" escapes the dot, which is otherwise a regex wildcard)
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[_]R1.fastq", "", .))
+  dplyr::rename_with(~ base::sub("[.]fastq", "", .))
 
 mirna_smrnaseq_data <- utils::read.table(base::file.path(config$smrnaseq_results_dir,
                                                          "/edgeR/miRBase_mature/mature_counts.csv"),
@@ -60,7 +60,7 @@ mirna_smrnaseq_data <- utils::read.table(base::file.path(config$smrnaseq_results
   base::as.data.frame() %>%
   janitor::row_to_names(row_number = 1) %>%
   # remove the ".mature" suffix from the sample/column names
-  dplyr::rename_with(~ base::sub("[_][S]\\d+[.]mature", "", .))
+  dplyr::rename_with(~ base::sub(".mature", "", .))
 
 # formatting to make counts datasets consistent between smrnaseq and excerpt pipelines
 mirna_smrnaseq_data <- mirna_smrnaseq_data %>%

diff --git a/test/fastq/sample01_S1_R1.fastq.gz → test/fastq/sample01.fastq.gz b/test/fastq/sample01_S1_R1.fastq.gz → test/fastq/sample01.fastq.gz
diff --git a/test/fastq/sample02_S1_R1.fastq.gz → test/fastq/sample02.fastq.gz b/test/fastq/sample02_S1_R1.fastq.gz → test/fastq/sample02.fastq.gz
diff --git a/test/fastq/sample03_S1_R1.fastq.gz → test/fastq/sample03.fastq.gz b/test/fastq/sample03_S1_R1.fastq.gz → test/fastq/sample03.fastq.gz
diff --git a/test/fastq/sample04_S1_R1.fastq.gz → test/fastq/sample04.fastq.gz b/test/fastq/sample04_S1_R1.fastq.gz → test/fastq/sample04.fastq.gz
diff --git a/test/fastq/sample05_S1_R1.fastq.gz → test/fastq/sample05.fastq.gz b/test/fastq/sample05_S1_R1.fastq.gz → test/fastq/sample05.fastq.gz
diff --git a/test/fastq/sample06_S1_R1.fastq.gz → test/fastq/sample06.fastq.gz b/test/fastq/sample06_S1_R1.fastq.gz → test/fastq/sample06.fastq.gz
diff --git a/test/fastq/sample07_S1_R1.fastq.gz → test/fastq/sample07.fastq.gz b/test/fastq/sample07_S1_R1.fastq.gz → test/fastq/sample07.fastq.gz
diff --git a/test/fastq/sample08_S1_R1.fastq.gz → test/fastq/sample08.fastq.gz b/test/fastq/sample08_S1_R1.fastq.gz → test/fastq/sample08.fastq.gz
diff --git a/test/fastq/sample09_S1_R1.fastq.gz → test/fastq/sample09.fastq.gz b/test/fastq/sample09_S1_R1.fastq.gz → test/fastq/sample09.fastq.gz
diff --git a/test/fastq/sample10_S1_R1.fastq.gz → test/fastq/sample10.fastq.gz b/test/fastq/sample10_S1_R1.fastq.gz → test/fastq/sample10.fastq.gz
diff --git a/test/fastq/sample11_S1_R1.fastq.gz → test/fastq/sample11.fastq.gz b/test/fastq/sample11_S1_R1.fastq.gz → test/fastq/sample11.fastq.gz
diff --git a/test/fastq/sample12_S1_R1.fastq.gz → test/fastq/sample12.fastq.gz b/test/fastq/sample12_S1_R1.fastq.gz → test/fastq/sample12.fastq.gz