forked from benjjneb/dada2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtaxonomy.R
742 lines (715 loc) · 33.1 KB
/
taxonomy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
#'
#' Classifies sequences against reference training dataset.
#'
#' assignTaxonomy implements the RDP Naive Bayesian Classifier algorithm described in
#' Wang et al. Applied and Environmental Microbiology 2007, with kmer size 8 and 100 bootstrap
#' replicates. Properly formatted reference files for several popular taxonomic databases
#' are available \url{http://benjjneb.github.io/dada2/training.html}
#'
#' @param seqs (Required). A character vector of the sequences to be assigned, or an object
#' coercible by \code{\link{getUniques}}.
#'
#' @param refFasta (Required). The path to the reference fasta file, or an
#' R connection Can be compressed.
#' This reference fasta file should be formatted so that the id lines correspond to the
#' taxonomy (or classification) of the associated sequence, and each taxonomic level is
#' separated by a semicolon. Eg.
#'
#' >Kingom;Phylum;Class;Order;Family;Genus;
#' ACGAATGTGAAGTAA......
#'
#' @param minBoot (Optional). Default 50.
#' The minimum bootstrap confidence for assigning a taxonomic level.
#'
#' @param tryRC (Optional). Default FALSE.
#' If TRUE, the reverse-complement of each sequences will be used for classification if it is a better match to the reference
#' sequences than the forward sequence.
#'
#' @param outputBootstraps (Optional). Default FALSE.
#' If TRUE, bootstrap values will be retained in an integer matrix. A named list containing the assigned taxonomies (named "taxa")
#' and the bootstrap values (named "boot") will be returned. Minimum bootstrap confidence filtering still takes place,
#' to see full taxonomy set minBoot=0
#'
#' @param taxLevels (Optional). Default is c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species").
#' The taxonomic levels being assigned. Truncates if deeper levels not present in
#' training fasta.
#'
#' @param multithread (Optional). Default is FALSE.
#' If TRUE, multithreading is enabled and the number of available threads is automatically determined.
#' If an integer is provided, the number of threads to use is set by passing the argument on to
#' \code{\link{setThreadOptions}}.
#'
#' @param verbose (Optional). Default FALSE.
#' If TRUE, print status to standard output.
#'
#' @return A character matrix of assigned taxonomies exceeding the minBoot level of
#' bootstrapping confidence. Rows correspond to the provided sequences, columns to the
#' taxonomic levels. NA indicates that the sequence was not consistently classified at
#' that level at the minBoot threshhold.
#'
#' If outputBootstraps is TRUE, a named list containing the assigned taxonomies (named "taxa")
#' and the bootstrap values (named "boot") will be returned.
#'
#' @export
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead id
#'
#' @examples
#' seqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' training_fasta <- system.file("extdata", "example_train_set.fa.gz", package="dada2")
#' taxa <- assignTaxonomy(seqs, training_fasta)
#' taxa80 <- assignTaxonomy(seqs, training_fasta, minBoot=80, multithread=2)
#'
assignTaxonomy <- function(seqs, refFasta, minBoot=50, tryRC=FALSE, outputBootstraps=FALSE,
taxLevels=c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
multithread=FALSE, verbose=FALSE) {
MIN_REF_LEN <- 20 # Enforced minimum length of reference seqs. Must be bigger than the kmer-size used (8).
MIN_TAX_LEN <- 50 # Minimum length of input sequences to get a taxonomic assignment
# Get character vector of sequences
seqs <- getSequences(seqs)
if(min(nchar(seqs)) < MIN_TAX_LEN) {
warning("Some sequences were shorter than ", MIN_TAX_LEN, " nts and will not receive a taxonomic classification.")
}
# Read in the reference fasta
refsr <- readFasta(refFasta)
lens <- width(sread(refsr))
if(any(lens<MIN_REF_LEN)) {
refsr <- refsr[lens>=MIN_REF_LEN]
warning(paste0("Some reference sequences were too short (<", MIN_REF_LEN, "nts) and were excluded."))
}
refs <- as.character(sread(refsr))
tax <- as.character(id(refsr))
tax <- sapply(tax, function(x) gsub("^\\s+|\\s+$", "", x)) # Remove leading/trailing whitespace
# Sniff and parse UNITE fasta format
UNITE <- FALSE
if(all(grepl("FU\\|re[pf]s", tax[1:10]))) {
UNITE <- TRUE
cat("UNITE fungal taxonomic reference detected.\n")
tax <- sapply(strsplit(tax, "\\|"), `[`, 5)
tax <- gsub("[pcofg]__unidentified;", "_DADA2_UNSPECIFIED;", tax)
tax <- gsub(";s__(\\w+)_", ";s__", tax)
tax <- gsub(";s__sp$", ";_DADA2_UNSPECIFIED", tax)
}
# Crude format check
if(!grepl(";", tax[[1]])) {
if(length(unlist(strsplit(tax[[1]], "\\s")))==3) {
stop("Incorrect reference file format for assignTaxonomy (this looks like a file formatted for assignSpecies).")
} else {
stop("Incorrect reference file format for assignTaxonomy.")
}
}
# Parse the taxonomies from the id string
tax.depth <- sapply(strsplit(tax, ";"), length)
td <- max(tax.depth)
for(i in seq(length(tax))) {
if(tax.depth[[i]] < td) {
for(j in seq(td - tax.depth[[i]])) {
tax[[i]] <- paste0(tax[[i]], "_DADA2_UNSPECIFIED;")
}
}
}
# Create the integer maps from reference to type ("genus") and for each tax level
genus.unq <- unique(tax)
ref.to.genus <- match(tax, genus.unq)
tax.mat <- matrix(unlist(strsplit(genus.unq, ";")), ncol=td, byrow=TRUE)
tax.df <- as.data.frame(tax.mat)
for(i in seq(ncol(tax.df))) {
tax.df[,i] <- factor(tax.df[,i])
tax.df[,i] <- as.integer(tax.df[,i])
}
tax.mat.int <- as.matrix(tax.df)
### Assign
# Parse multithreading argument
if(is.logical(multithread)) {
if(multithread==TRUE) { RcppParallel::setThreadOptions(numThreads = "auto") }
else { RcppParallel::setThreadOptions(numThreads = 1) }
} else if(is.numeric(multithread)) {
RcppParallel::setThreadOptions(numThreads = multithread)
} else {
warning("Invalid multithread parameter. Running as a single thread.")
RcppParallel::setThreadOptions(numThreads = 1)
}
# Run C assignemnt code
assignment <- C_assign_taxonomy2(seqs, rc(seqs), refs, ref.to.genus, tax.mat.int, tryRC, verbose)
# Parse results and return tax consistent with minBoot
bestHit <- genus.unq[assignment$tax]
boots <- assignment$boot
taxes <- strsplit(bestHit, ";")
taxes <- lapply(seq_along(taxes), function(i) taxes[[i]][boots[i,]>=minBoot])
# Convert to character matrix
tax.out <- matrix(NA_character_, nrow=length(seqs), ncol=td)
for(i in seq(length(seqs))) {
if(length(taxes[[i]]) > 0) {
tax.out[i,1:length(taxes[[i]])] <- taxes[[i]]
}
}
rownames(tax.out) <- seqs
colnames(tax.out) <- taxLevels[1:ncol(tax.out)]
tax.out[tax.out=="_DADA2_UNSPECIFIED"] <- NA_character_
if(outputBootstraps){
# Convert boots to integer matrix
boots.out <- matrix(boots, nrow=length(seqs), ncol=td)
rownames(boots.out) <- seqs
colnames(boots.out) <- taxLevels[1:ncol(boots.out)]
list(tax=tax.out, boot=boots.out)
} else {
tax.out
}
}
# Helper function for assignSpecies
mapHits <- function(x, refs, keep, sep="/") {
hits <- refs[x]
hits[grepl("Escherichia", hits, fixed=TRUE) | grepl("Shigella", hits, fixed=TRUE)] <- "Escherichia/Shigella"
if(length(unique(hits))<=keep) {
rval <- do.call(paste, c(as.list(sort(unique(hits))), sep=sep))
} else { rval <- NA_character_ }
if(length(rval)==0) rval <- NA_character_
rval
}
# Match curated genus names to binomial genus names
# Handles Clostridium groups and split genera names
matchGenera <- function(gen.tax, gen.binom, split.glyph="/") {
if(is.na(gen.tax) || is.na(gen.binom)) { return(FALSE) }
if((gen.tax==gen.binom) ||
grepl(paste0("^", gen.binom, "[ _", split.glyph, "]"), gen.tax) ||
grepl(paste0(split.glyph, gen.binom, "$"), gen.tax)) {
return(TRUE)
} else {
return(FALSE)
}
}
#'
#' Taxonomic assignment to the species level by exact matching.
#'
#' \code{assignSpecies} uses exact matching against a reference fasta to identify the
#' genus-species binomial classification of the input sequences.
#'
#' @param seqs (Required). A character vector of the sequences to be assigned, or an object
#' coercible by \code{\link{getUniques}}.
#'
#' @param refFasta (Required). The path to the reference fasta file, or an
#' R connection. Can be compressed.
#' This reference fasta file should be formatted so that the id lines correspond to the
#' genus-species of the associated sequence:
#'
#' >SeqID genus species
#' ACGAATGTGAAGTAA......
#'
#' @param allowMultiple (Optional). Default FALSE.
#' Defines the behavior when multiple exact matches against different species are returned.
#' By default only unambiguous identifications are return. If TRUE, a concatenated string
#' of all exactly matched species is returned. If an integer is provided, multiple
#' identifications up to that many are returned as a concatenated string.
#'
#' @param tryRC (Optional). Default FALSE.
#' If TRUE, the reverse-complement of each sequences will also be tested for exact matching
#' to the reference sequences.
#'
#' @param n (Optional). Default \code{2000}.
#' The number of sequences to perform assignment on at one time.
#' This controls the peak memory requirement so that large numbers of sequences are supported.
#'
#' @param verbose (Optional). Default FALSE.
#' If TRUE, print status to standard output.
#'
#' @return A two-column character matrix. Rows correspond to the provided sequences,
#' columns to the genus and species taxonomic levels. NA indicates that the sequence
#' was not classified at that level.
#'
#' @export
#'
#' @importFrom Biostrings vcountPDict
#' @importFrom Biostrings PDict
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead reverseComplement
#' @importFrom ShortRead id
#' @importFrom methods as
#'
#' @examples
#' seqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' species_fasta <- system.file("extdata", "example_species_assignment.fa.gz", package="dada2")
#' spec <- assignSpecies(seqs, species_fasta)
#'
assignSpecies <- function(seqs, refFasta, allowMultiple=FALSE, tryRC=FALSE, n=2000, verbose=FALSE) {
# Define number of multiple species to return
if(is.logical(allowMultiple)) {
if(allowMultiple) keep <- Inf
else keep <- 1
} else {
keep <- as.integer(allowMultiple)
}
# Get character vector of sequences
seqs <- getSequences(seqs)
# Read in the reference fasta
refsr <- readFasta(refFasta)
ids <- as(id(refsr), "character")
# Crude format check
if(!length(unlist(strsplit(ids[[1]], "\\s"))) >= 3) {
if(length(unlist(gregexpr(";", ids[[1]]))) >= 3) {
stop("Incorrect reference file format for assignSpecies (this looks like a file formatted for assignTaxonomy).")
} else {
stop("Incorrect reference file format for assignSpecies.")
}
}
genus <- sapply(strsplit(ids, "\\s"), `[`, 2)
species <- sapply(strsplit(ids, "\\s"), `[`, 3)
# Identify the exact hits
hits <- vector("list", length(seqs))
lens <- nchar(seqs)
for(len in unique(lens)) { # Requires all same length sequences
i.len <- which(lens==len); n.len <- length(i.len)
j.lo<-1; j.hi<-min(n,n.len)
while(j.lo <= n.len) {
i.loop <- i.len[j.lo:j.hi]
seqdict <- PDict(seqs[i.loop])
vhit <- (vcountPDict(seqdict, sread(refsr))>0)
if(tryRC) vhit <- vhit | (vcountPDict(seqdict, reverseComplement(sread(refsr)))>0)
hits[i.loop] <- lapply(seq(nrow(vhit)), function(x) vhit[x,])
j.lo <- j.lo + n; j.hi <- min(j.hi+n, n.len)
rm(seqdict)
gc()
}
}
# Get genus species return strings
rval <- cbind(unlist(sapply(hits, mapHits, refs=genus, keep=1)),
unlist(sapply(hits, mapHits, refs=species, keep=keep)))
colnames(rval) <- c("Genus", "Species")
rownames(rval) <- seqs
gc()
if(verbose) cat(sum(!is.na(rval[,"Species"])), "out of", length(seqs), "were assigned to the species level.\n")
rval
}
#'
#' Add species-level annotation to a taxonomic table.
#'
#' \code{addSpecies} wraps the \code{\link{assignSpecies}} function to assign genus-species
#' binomials to the input sequences by exact matching against a reference fasta. Those binomials
#' are then merged with the input taxonomic table with species annotations appended as an
#' additional column to the input table.
#' Only species identifications where the genera in the input table and the binomial
#' classification are consistent are included in the return table.
#'
#' @param taxtab (Required). A taxonomic table, the output of \code{\link{assignTaxonomy}}.
#'
#' @param refFasta (Required). The path to the reference fasta file, or an
#' R connection. Can be compressed.
#' This reference fasta file should be formatted so that the id lines correspond to the
#' genus-species binomial of the associated sequence:
#'
#' >SeqID genus species
#' ACGAATGTGAAGTAA......
#'
#' @param allowMultiple (Optional). Default FALSE.
#' Defines the behavior when multiple exact matches against different species are returned.
#' By default only unambiguous identifications are return. If TRUE, a concatenated string
#' of all exactly matched species is returned. If an integer is provided, multiple
#' identifications up to that many are returned as a concatenated string.
#'
#' @param tryRC (Optional). Default FALSE.
#' If TRUE, the reverse-complement of each sequences will be used for classification if it is a better match to the reference
#' sequences than the forward sequence.
#'
#' @param n (Optional). Default \code{1e5}.
#' The number of records (reads) to read in and filter at any one time.
#' This controls the peak memory requirement so that very large fastq files are supported.
#' See \code{\link{FastqStreamer}} for details.
#'
#' @param verbose (Optional). Default FALSE.
#' If TRUE, print status to standard output.
#'
#' @return A character matrix one column larger than input. Rows correspond to
#' sequences, and columns to the taxonomic levels. NA indicates that the sequence
#' was not classified at that level.
#'
#' @seealso
#' \code{\link{assignTaxonomy}}, \code{\link{assignSpecies}}
#'
#' @export
#'
#' @examples
#'
#' seqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' training_fasta <- system.file("extdata", "example_train_set.fa.gz", package="dada2")
#' taxa <- assignTaxonomy(seqs, training_fasta)
#' species_fasta <- system.file("extdata", "example_species_assignment.fa.gz", package="dada2")
#' taxa.spec <- addSpecies(taxa, species_fasta)
#' taxa.spec.multi <- addSpecies(taxa, species_fasta, allowMultiple=TRUE)
#'
addSpecies <- function(taxtab, refFasta, allowMultiple=FALSE, tryRC=FALSE, n=2000, verbose=FALSE) {
seqs <- rownames(taxtab)
binom <- assignSpecies(seqs, refFasta=refFasta, allowMultiple=allowMultiple, tryRC=tryRC, n=n, verbose=verbose)
# Merge tables
if("Genus" %in% colnames(taxtab)) gcol <- which(colnames(taxtab) == "Genus")
else gcol <- ncol(taxtab)
# Match genera
gen.match <- mapply(matchGenera, taxtab[,gcol], binom[,1])
taxtab <- cbind(taxtab, binom[,2])
colnames(taxtab)[ncol(taxtab)] <- "Species"
taxtab[!gen.match,"Species"] <- NA_character_
if(verbose) cat("Of which", sum(!is.na(taxtab[,"Species"])),"had genera consistent with the input table.")
taxtab
}
#' This function creates the dada2 assignTaxonomy training fasta for the RDP trainset .fa file
#' The RDP trainset data was downloaded from: https://sourceforge.net/projects/rdp-classifier/files/RDP_Classifier_TrainingData/
#'
#' ## RDP Trainset 18
#' path <- "~/Desktop/RDP/RDPClassifier_16S_trainsetNo18_rawtrainingdata"
#' dada2:::makeTaxonomyFasta_RDP(file.path(path, "trainset18_062020.fa"),
#' file.path(path, "trainset18_db_taxid.txt"),
#' "~/tax/rdp_train_set_18.fa.gz")
#' dada2:::tax.check("~/tax/rdp_train_set_18.fa.gz", "~/Desktop/ten_16s.100.fa")
#'
#' ## RDP Trainset 16
#' path <- "~/Desktop/RDP/RDPClassifier_16S_trainsetNo16_rawtrainingdata"
#' dada2:::makeTaxonomyFasta_RDP(file.path(path, "trainset16_022016.fa"),
#' file.path(path, "trainset16_db_taxid.txt"),
#' "~/tax/rdp_train_set_16.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom utils read.table
#' @keywords internal
makeTaxonomyFasta_RDP <- function(fin, fdb, fout, compress=TRUE) {
# Read in the fasta and pull out the taxonomy entries
sr <- readFasta(fin)
id <- as.character(gsub("\"", "", id(sr)))
tax <- sapply(strsplit(id, "\\t"), `[`, 2)
tax <- gsub("^Root;", "", tax)
tax <- strsplit(tax, ";")
# Get the names of the standard 6 taxonomic levels
db <- read.table(file=fdb, sep="*", stringsAsFactors=FALSE)
colnames(db) <- c("Index", "Name", "L", "R", "Level")
keep <- db$Name[db$Level %in% c("domain", "phylum", "class", "order", "family", "genus")]
# Cut down to just the 6 level taxonomy
tax <- sapply(tax, function(x) x[x %in% keep])
tax <- lapply(tax, paste, collapse = ";")
tax <- unlist(tax)
# Final formatting
tax <- paste0(tax, ";") # Ending semicolon
tax <- gsub("[^;]*_incertae_sedis;$", "", tax) # Uncertain lowest-level assignment is better to leave blank
tax <- gsub(" ", "_", tax)
# Write to disk
writeFasta(ShortRead(sread(sr), BStringSet(tax)), fout,
width=20000L, compress=compress)
}
#' This function creates the dada2 assignSpecies fasta file for the RDP
#' from the RDP's _Bacteria_unaligned.fa file.
#'
#' ## RDP Trainset 18/Release 11.5
#' ## The RDP documentation does not make clear whether the updates to the taxonomy from training set release 18 were
#' ## propagated to the current Bacterial alignment.
#' dada2:::makeSpeciesFasta_RDP("~/Desktop/RDP/current_Bacteria_unaligned.fa", "~/tax/rdp_species_assignment_18.fa.gz")
#' dada2:::tax.check("~/tax/rdp_species_assignment_18.fa.gz", "~/Desktop/ten_16s.100.fa", mode="species")
#'
#' ## RDP Trainset 16/Release 11.5
#' dada2:::makeSpeciesFasta_RDP("~/Desktop/RDP/current_Bacteria_unaligned.fa", "~/tax/rdp_species_assignment_16.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead narrow
#' @importFrom IRanges narrow
#' @importFrom Biostrings BStringSet
#' @keywords internal
makeSpeciesFasta_RDP <- function(fin, fout, compress=TRUE) {
# Read in and remove records not assigned to species
sr <- readFasta(fin)
is.uncult <- grepl("[Uu]ncultured", id(sr))
sr <- sr[!is.uncult]
is.unclass <- grepl("[Uu]nclassified", id(sr))
sr <- sr[!is.unclass]
is.outgroup <- (grepl("Outgroup", id(sr)))
sr <- sr[!is.outgroup]
is.unident <- grepl("[Uu]nidentified", id(sr))
sr <- sr[!is.unident]
# Pull out the genus species binomial string
binom <- sapply(strsplit(as.character(id(sr)), ";"), `[`, 1)
binom <- sapply(strsplit(binom, "\\t"), `[`, 1)
binom <- gsub(" \\(T\\)", "", binom)
binom <- gsub("\\[", "", binom)
binom <- gsub("\\]", "", binom)
# Match genera between binomial and the curated taxonomy
bar <- strsplit(as.character(id(sr)), ";")
barlens <- sapply(bar, length)
geni <- mapply(function(x,y) x[[y]], bar, barlens-1)
# Get rid of SXXX id
binom <- gsub("^S[0123456789]{9} ", "", binom)
binom <- gsub("\'" , "", binom)
# Drop Candidatus strings
binom <- gsub("Candidatus ", "", binom)
geni <- gsub("Candidatus ", "", geni)
# Subset down to those binomials which match the curated genus
binom.geni <- sapply(strsplit(binom, "\\s"), `[`, 1)
gen.match <- mapply(matchGenera, geni, binom.geni)
sr <- sr[gen.match]
binom <- binom[gen.match]
geni <- geni[gen.match]
# Make matrix of genus/species
binom[sapply(strsplit(binom, "\\s"), length)==1] <- paste(binom[sapply(strsplit(binom, "\\s"), length)==1], "sp.")
binom2 <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1),
sapply(strsplit(binom, "\\s"), `[`, 2))
# Keep only those with a named species
has.spec <- !grepl("sp\\.", binom2[,2])
sum(has.spec)
binom2 <- binom2[has.spec,]
sr <- sr[has.spec]
binom <- binom[has.spec]
geni <- geni[has.spec]
cat(length(binom), "sequences with genus/species binomial annotation output.\n")
# Write to disk
ids <- as.character(narrow(id(sr),1,10))
writeFasta(ShortRead(sread(sr), BStringSet(paste(ids, binom))), fout,
width=20000L, compress=compress)
}
#' This function creates the dada2 assignTaxonomy training fasta for the official Silva NR99
#' release files. If `include.species`=TRUE, a 7th taxonomic level (species) will be added based on the
#' Genus species binomial in the Silva taxonomy string, if present and valid.
#'
#' ## Silva release v138
#' path <- "~/tax/Silva/v138"
#' dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"),
#' file.path(path, "tax_slv_ssu_138.txt"),
#' "~/Desktop/silva_nr99_v138_train_set.fa.gz")
#' dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"),
#' file.path(path, "tax_slv_ssu_138.txt"),
#' include.species=TRUE, "~/Desktop/silva_nr99_v138_wSpecies_train_set.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom utils read.table
#' @keywords internal
makeTaxonomyFasta_SilvaNR <- function(fin, ftax, fout, include.species=FALSE, compress=TRUE) {
xset <- DNAStringSet(readRNAStringSet(fin, format="fasta"))
# taxl: The taxonmic strings or (l)ines associated with each entry. Named by the sequence ID/accession.
taxl <- names(xset)
names(taxl) <- sapply(strsplit(names(xset), "\\s"), `[`, 1)
if(any(duplicated(names(taxl)))) stop("Duplicated sequence IDs detected.")
names(xset) <- names(taxl)
taxl <- gsub("^[A-Za-z0-9.]+\\s", "", taxl)
# Fix Silva- or release-specific errors
taxl <- gsub(";YM;", ";", taxl) # YM bacterial suborder included in 138 Release in error (confirmed by Silva)
## taxl <- gsub(";Rahnella1", ";Rahnella", taxl) # Rahnella1 genus seems like an error, shares same species w/ Rahnella,
## But! also in official tax file. Maybe check with Silva on this one.
# taxa: A list of the ordered taxonomic levels corresponding to each reference sequence. Named by the sequence ID/accession.
taxa <- strsplit(taxl, ";")
# Read in the defined Silva taxonomic levels, e.g. Bacteria;Desulfobacterota;Desulfobulbia;Desulfobulbales;Desulfurivibrionaceae;
silva.taxa <- read.table(ftax, sep="\t", col.names=c("Taxon", "V2", "Level", "V4", "V5"), stringsAsFactors=FALSE)
silva.taxa <- silva.taxa[,c("Taxon", "Level")]
# Subset down to Bacteria and Archaea
kingdom <- sapply(strsplit(taxl, ";"), `[`, 1)
taxl.ba <- taxl[kingdom %in% c("Bacteria", "Archaea")]
taxa.ba <- taxa[names(taxl.ba)]
# Create 6-column matrix with Silva taxonomic assignment for each sequence at each level from Kingdom to Genus (NA if no assignment)
taxa.ba.mat <- matrix(sapply(taxa.ba, function(flds) {
c(flds[1], flds[2], flds[3], flds[4], flds[5], flds[6])
}), ncol=6, byrow=TRUE)
rownames(taxa.ba.mat) <- names(taxl.ba)
# Create 6-column matrix with full Silva taxonomic string at each level for each sequence, from Kingdom to Genus
# Strings will include NA levels if no assignment at that level, e.g. Bacteria;Firmicutes;NA;NA
taxa.ba.mat.string <- matrix("UNDEF", nrow=nrow(taxa.ba.mat), ncol=ncol(taxa.ba.mat))
rownames(taxa.ba.mat.string) <- names(taxl.ba)
taxa.ba.mat.string[,1] <- paste0(taxa.ba.mat[,1],";")
for(col in seq(2,6)) {
taxa.ba.mat.string[,col] <- paste0(taxa.ba.mat.string[,col-1], taxa.ba.mat[,col],";")
}
if(any(taxa.ba.mat.string == "UNDEF")) stop("Taxon string matrix was not fully initialized.")
# Define the set of valid taxonomic assignment by their appearance in the list of valid Silva taxonomic levels
taxa.ba.mat.is_valid <- matrix(taxa.ba.mat.string %in% silva.taxa$Taxon, ncol=6)
# Update taxa.ba.mat matrix by replacing invalid entries with NAs
taxa.ba.mat[!taxa.ba.mat.is_valid] <- NA
# Also replace "uncultured" taxonomic ranks with NAs (note, uncultured only shows up as the terminal "assigned" rank)
taxa.ba.mat[taxa.ba.mat %in% c("Uncultured", "uncultured")] <- NA
######### ADD SPECIES PART HERE ##############
if(include.species) {
# Add the 7th column, which will be the species column
taxa.ba.mat <- cbind(taxa.ba.mat,
matrix(sapply(taxa.ba, `[`, 7), ncol=1, byrow=TRUE))
# Get validated genus from the matrix
genus <- taxa.ba.mat[,6]
genus <- gsub("Candidatus ", "", genus)
genus <- gsub("\\[", "", genus)
genus <- gsub("\\]", "", genus)
# Get the "binomial" string from the 7th field in the Silva taxonomic annotation
# The "binomial" field is not curated like the other Silva taxonomic levels, and can have varying info
# We assume that the first two words are the Genus species binomial, when there is a valid one in the field
# NOTE: the binomial is actually not always in the 7th field, so this isn't strictly correct.
# the binomial is in the "last" field, which may be <7 when not all the levels down to genus are assigned.
# But we are throwing away everything that doesn't match the genus anyway, so that case
# doesn't need to be handled correctly here.
binom <- taxa.ba.mat[,7]
binom <- gsub("Candidatus ", "", binom)
binom <- gsub("\\[", "", binom)
binom <- gsub("\\]", "", binom)
# Pull out the first two fields, and turn binom into a two column matrix (Genus, species)
binom <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1),
sapply(strsplit(binom, "\\s"), `[`, 2))
# Identify binomials that match the curated genus
gen.match <- mapply(dada2:::matchGenera, genus, binom[,1], split.glyph="-")
# Identify some other types of invalid species names
is.NA <- apply(binom, 1, function(x) any(is.na(x)))
is.sp <- grepl("sp\\.", binom[,2]) # "sp." is not a valid species name, just a generic
is.endo <- binom[,1] %in% "endosymbiont" | binom[,2] %in% "endosymbiont"
is.uncult <- grepl("[Uu]ncultured", binom[,1]) | grepl("[Uu]ncultured", binom[,2])
is.unident <- grepl("[Uu]nidentified", binom[,1]) | grepl("[Uu]nidentified", binom[,2])
# Define the "valid" species, and set invalid species to NA in the taxonomic matrix
valid.spec <- gen.match & !is.NA & !is.sp & !is.endo & !is.uncult & !is.unident
binom[!valid.spec,2] <- NA
taxa.ba.mat[,7] <- binom[,2]
}
# Organize a small number of Eukaryota sequences for outgroup purposes, keeping only the Eukaryota Kingdom taxonomic assignment
set.seed(100); N_EUK <- 100
euk.keep <- sample(names(taxl)[kingdom %in% "Eukaryota"], N_EUK)
taxa.euk.mat <- matrix("", nrow=N_EUK, ncol=ncol(taxa.ba.mat))
rownames(taxa.euk.mat) <- euk.keep
taxa.euk.mat[,1] <- "Eukaryota"
taxa.euk.mat[,2:ncol(taxa.euk.mat)] <- NA
# Now need to make the final training fasta in DADA2 format.
taxa.mat.final <- rbind(taxa.ba.mat, taxa.euk.mat)
taxa.string.final <- apply(taxa.mat.final, 1, function(x) {
tst <- paste(x, collapse=";")
tst <- paste0(tst, ";")
tst <- gsub("NA;", "", tst)
tst
})
if(any(is.na(names(taxa.string.final)))) stop("NA names in the final set of taxon strings.")
if(!all(names(taxa.string.final) %in% names(xset))) stop("Some names of the final set of taxon strings don't match sequence names.")
xset.out <- xset[names(taxa.string.final)]
## Add some verbose output describing what happened.
cat(length(xset.out), "reference sequences were output.\n")
print(table(taxa.mat.final[,1], useNA="ifany"))
if(include.species) cat(sum(!is.na(taxa.mat.final[,7])), "entries include species names.\n")
writeFasta(ShortRead(unname(xset.out), BStringSet(taxa.string.final)), fout,
width=20000L, compress=compress)
}
#' DEPRECATED in favor of `makeTaxonomyFasta_SilvaNR``
#'
#' This function creates the dada2 assignTaxonomy training fasta for the Silva .align file
#' generated by the Mothur project.
#'
#' ## Silva release v128
#' path <- "~/Desktop/Silva/Silva.nr_v128"
#' dada2:::makeTaxonomyFasta_Silva(file.path(path, "silva.nr_v128.align"),
#' file.path(path, "silva.nr_v128.tax"),
#' "~/tax/silva_nr_v128_train_set.fa.gz")
#'
#' ## Silva release v132
#' path <- "~/Desktop/Silva/Silva.nr_v132"
#' dada2:::makeTaxonomyFasta_Silva(file.path(path, "silva.nr_v132.align"),
#' file.path(path, "silva.nr_v132.tax"),
#' "~/tax/silva_nr_v132_train_set.fa.gz")
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom utils read.table
#' @keywords internal
makeTaxonomyFasta_Silva <- function(fin, ftax, fout, compress=TRUE) {
# Read in the fasta and pull out the taxonomy entries
sr <- readFasta(fin) # ~10GB to read in
ids <- sapply(strsplit(as.character(id(sr)), "\\t"), `[`, 1)
seqs <- gsub("[.-]", "", sread(sr)) # Takes a while
rm(sr);gc()
# Read in the taxnoomy file
taxdf <- read.table(ftax, sep="\t", header=FALSE, stringsAsFactors = FALSE)
colnames(taxdf) <- c("id", "tax")
taxdf$tax <- gsub("^\\s+|\\s+$", "", taxdf$tax)
if(!identical(taxdf$id, ids)) stop("Input align and taxonomy files don't match.")
# Final formatting
tax <- taxdf$tax
tax <- gsub("Escherichia-Shigella", "Escherichia/Shigella", tax)
# Remove faux-assignments added by new Mothur processing script
tax <- gsub("[^;]*_ge;$", "", tax)
tax <- gsub("[^;]*_fa;$", "", tax)
tax <- gsub("[^;]*_or;$", "", tax)
tax <- gsub("[^;]*_cl;$", "", tax)
tax <- gsub("[^;]*_ph;$", "", tax)
tax <- gsub(";uncultured;$", ";", tax)
# Write to disk
writeFasta(ShortRead(DNAStringSet(seqs), BStringSet(tax)), fout,
width=20000L, compress=compress)
}
#' This function creates the dada2 assignSpecies fasta file for Silva
#' from the SILVA_[VERSION]_SSURef_tax_silva.fasta file
#'
#' ## Silva release v128
#' dada2:::makeSpeciesFasta_Silva("~/Desktop/Silva/SILVA_128_SSURef_tax_silva.fasta.gz",
#' "~/tax/silva_species_assignment_v128.fa.gz")
#'
#' ## Silva release v132
#' dada2:::makeSpeciesFasta_Silva("~/Desktop/Silva/SILVA_132_SSURef_tax_silva.fasta.gz",
#' "~/tax/silva_species_assignment_v132.fa.gz")
#'
#' Output: 313502 sequences with genus/species binomial annotation output.
#'
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead writeFasta
#' @importFrom ShortRead sread
#' @importFrom Biostrings BStringSet
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings readRNAStringSet
#' @keywords internal
makeSpeciesFasta_Silva <- function(fin, fout, compress=TRUE) {
# Read in and remove records not assigned to species and non-bacteria
xset <- DNAStringSet(readRNAStringSet(fin, format="fasta"))
is.bact <- grepl("Bacteria;", names(xset), fixed=TRUE)
xset <- xset[is.bact]
is.uncult <- grepl("[Uu]ncultured", names(xset))
xset <- xset[!is.uncult]
is.unident <- grepl("[Uu]nidentified", names(xset))
xset <- xset[!is.unident]
is.complete <- sapply(strsplit(as.character(names(xset)), ";"), length)==7
xset <- xset[is.complete]
# Pull out binomial strings
binom <- strsplit(as.character(names(xset)), ";")
genus <- sapply(binom, `[`, 6)
binom <- sapply(binom, `[`, 7)
genus <- gsub("Candidatus ", "", genus)
binom <- gsub("Candidatus ", "", binom)
genus <- gsub("\\[", "", genus)
genus <- gsub("\\]", "", genus)
binom <- gsub("\\[", "", binom)
binom <- gsub("\\]", "", binom)
# Subset down to those binomials which match the curated genus
genus.binom <- sapply(strsplit(binom, "\\s"), `[`, 1)
gen.match <- mapply(matchGenera, genus, genus.binom, split.glyph="-")
# Note that raw Silva files use Escherichia-Shigella, but this is changed to Escherichia/Shigella in dada2 version
xset <- xset[gen.match]
binom <- binom[gen.match]
genus <- genus[gen.match]
# Make matrix of genus/species
binom[sapply(strsplit(binom, "\\s"), length)==1] <- paste(binom[sapply(strsplit(binom, "\\s"), length)==1], "sp.")
binom2 <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1),
sapply(strsplit(binom, "\\s"), `[`, 2))
# Keep only those with a named species
has.spec <- !grepl("sp\\.", binom2[,2]) & !(binom2[,2]=="endosymbiont")
binom2 <- binom2[has.spec,]
xset <- xset[has.spec]
binom <- binom[has.spec]
genus <- genus[has.spec]
cat(length(binom), "sequences with genus/species binomial annotation output.\n")
# Write to disk
ids <- sapply(strsplit(as.character(names(xset)), "\\s"), `[`, 1)
writeFasta(ShortRead(unname(xset), BStringSet(paste(ids, binom))), fout,
width=20000L, compress=compress)
}
## This uses the "ten_16s.100.fa" originally from Robert Edgar's taxonomy testing page: https://drive5.com/taxxi/doc/fasta_index.html
## This file is relicensed here under the DADA2 LGPL2 license on permission from Robert Edgar.
tax.check <- function(fn.tax, fn.test=system.file("extdata", "example_seqs.fa", package="dada2"), nseq=100, level=6, mode="taxonomy") {
sq.test <- sample(getSequences(fn.test), nseq)
if(mode == "taxonomy") {
tax <- assignTaxonomy(sq.test, fn.tax, multi=TRUE)
cbind(unname(tax[,level]), sapply(strsplit(names(sq.test), ":"), `[`, level+1))
} else if (mode=="species") {
sq.acgt <- sq.test[dada2:::C_isACGT(sq.test)]
spc <- assignSpecies(sq.acgt, fn.spc)
cbind(unname(spc[,level-5]), sapply(strsplit(names(sq.acgt), ":"), `[`, level+1))
} else { stop("Valid modes are taxonomy or species.") }
}