-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathlcvp_match.R
217 lines (208 loc) · 8.94 KB
/
lcvp_match.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#' Match two name lists using the Leipzig Catalogue of Plants (LCVP)
#'
#'
#' Matches and compares two name lists based on the taxonomic resolution of
#' plant taxa names listed in the "Leipzig Catalogue of Vascular Plants" (LCVP).
#'
#' @param splist1 A character vector specifying the reference input taxon to be matched.
#' Each element including genus and specific epithet and, potentially,
#' infraspecific rank, infraspecific name, and author name. Only valid characters are allowed
#' (see \code{\link[base:validEnc]{base:validEnc}}).
#'
#' @param splist2 A character vector specifying the input taxon to match splist1.
#' Each element including genus and specific epithet and, potentially, infraspecific rank,
#' infraspecific name, and author name. Only valid characters are allowed
#' (see \code{\link[base:validEnc]{base:validEnc}}).
#'
#'@param max_distance It represents the maximum string distance allowed for a
#' match when comparing the submitted name with the closest name matches in the
#' LCVP. The distance used is a generalized Levenshtein distance that indicates
#' the total number of insertions, deletions, and substitutions allowed to
#' match the two names. It can be expressed as an integer or as the fraction of
#' the binomial name. For example, a name with length 10, and a max_distance =
#' 0.1, allow only one change (insertion, deletion, or substitution). A
#' max_distance = 2, allows two changes.
#'
#'@param genus_fuzzy If TRUE, the fuzzy match algorithm based on max_distance
#' will also be applied to the genus (note that this may considerably increase
#' computational time). If FALSE, fuzzy match will only apply to the epithet.
#'
#'@param grammar_check If TRUE, the algorithm will try to fix common Latin
#'grammar mistakes.
#'
#' @param include_all If \code{TRUE} (default), it will include all species in both
#' \code{splist1} and \code{splist2}. If \code{FALSE}, it will exclude species
#' only found in \code{splist2}.
#'
#' @param identify_dups If \code{TRUE} (default), a column indicating the position
#' of duplicated LCVP output names is added to the resulting data.frame.
#'
#'
#' @return
#' A data.frame with the following columns:
#'
#' \itemize{
#' \item{\emph{Species.List.1}}{: Taxa name list provided by the user in the
#' splist1.}
#' \item{\emph{Species.List.2}}{: Taxa name list provided by the user in the
#' splist2.}
#' \item{global.Id}{The fixed species id of the input taxon in the
#' Leipzig Catalogue of Vascular Plants (LCVP).}
#' \item{Input.Genus}{A
#' character vector. The input genus of the corresponding vascular plant
#' species name listed in LCVP.}
#' \item{Input.Epitheton}{A character vector.
#' The input epitheton of the corresponding vascular plant species name listed
#' in LCVP.}
#' \item{Rank}{A character vector. The taxonomic rank ("species",
#' subspecies: "subsp.", variety: "var.", subvariety: "subvar.", "forma", or
#' subforma: "subf.") of the corresponding vascular plant species name listed
#' in LCVP.}
#' \item{Input.Subspecies.Epitheton}{A character vector. If the
#' indicated rank is below species, the subspecies epitheton input of the
#' corresponding vascular plant species name listed in LCVP. If the rank is
#' "species", the input is "nil".}
#' \item{Input.Authors}{A character vector.
#' The taxonomic authority input of the corresponding vascular plant species
#' name listed in LCVP.}
#' \item{Status}{A character vector. Describes whether a
#' taxon is classified as ‘valid’, ‘synonym’, ‘unresolved’, ‘external’ or
#' ‘blanks’. The ‘unresolved’ rank means that the status of the plant name
#' could be either valid or synonym, but the information available does not
#' allow a definitive decision. ‘External’ is an extra rank which lists names
#' outside the scope of this publication but useful to keep on this updated
#' list. ‘Blanks’ means that the respective name exists in bibliography but it
#' is clear neither where it came from nor whether it is valid, synonym or
#' unresolved. (See the main text of Freiberg et al. for more details.)}
#' \item{globalId.of.Output.Taxon}{The fixed species id of the output taxon
#' in LCVP.}
#' \item{Output.Taxon}{A character vector. The list of the accepted
#' plant taxa names according to the LCVP.}
#' \item{Family}{A character vector.
#' The corresponding family name of the Input.Taxon, staying empty if the
#' Status is unresolved.}
#' \item{Order}{A character vector. The corresponding
#' order name of the Input.Taxon, staying empty if the Status is unresolved.}
#' \item{Literature}{A character vector. The bibliography used.}
#' \item{Comments}{A character vector. Further taxonomic comments.}
#' \item{\emph{Match.Position.2to1}}{: positions in splist2 of the names in
#' splist1. Can be used to reorder splist2 to match splist1.
#' }
#' \item{\emph{Duplicated.Output.Position}}{: If \code{identify_dups = TRUE}, it
#' indicates the position of duplicated names in the LCVP.Output.Taxon column.
#' This may occur if two inputs are now synonyms. It will output NA if the
#' species name has no duplicates.
#' }
#'
#' }
#' See \code{\link[LCVP:tab_lcvp]{LCVP:tab_lcvp}} for more details.
#'
#' If \code{include_all = TRUE}, all species will be included. Ordered based on the
#' \code{splist1}, and followed by non-matched names in \code{splist2}.
#' If \code{include_all = FALSE}, non-matched names in \code{splist2} are not
#' included.
#'
#' @author
#' Bruno Vilela & Alexander Ziska
#'
#' @seealso
#' \code{\link[lcvplants:lcvp_join]{lcvp_join}}
#'
#'
#' @references
#' Freiberg, M., Winter, M., Gentile, A. et al. LCVP, The Leipzig
#' catalogue of vascular plants, a new taxonomic reference list for all known
#' vascular plants. Sci Data 7, 416 (2020).
#' https://doi.org/10.1038/s41597-020-00702-z
#'
#' @keywords R-package nomenclature taxonomy vascular plants
#'
#' @examples
#' # Ensure that LCVP package is available before running the example.
#' # If it is not, see the `lcvplants` package vignette for details
#' # on installing the required data package.
#' if (requireNamespace("LCVP", quietly = TRUE)) { # Do not run this
#'
#' # Generate two lists of species name
#' splist1 <- sample(apply(LCVP::tab_lcvp[2:10, 2:3], 1, paste, collapse = " "))
#' splist2 <- sample(apply(LCVP::tab_lcvp[11:3, 2:3], 1, paste, collapse = " "))
#'
#' # Including all species in both lists
#' lcvp_match(splist1, splist2, include_all = TRUE)
#'
#' # Including only the species in the first list
#' matchLists <- lcvp_match(splist1, splist2, include_all = FALSE)
#' ## This can be used to quickly change positions in splist2 to match splist1
#' splist2[matchLists$Match.Position.2to1]
#'
#' }
#'@export
lcvp_match <- function(splist1,
                       splist2,
                       max_distance = 0.2,
                       genus_fuzzy = FALSE,
                       grammar_check = FALSE,
                       include_all = TRUE,
                       identify_dups = TRUE) {
  hasData() # Check if LCVP is installed

  # Defensive: work on character vectors rather than factors
  if (is.factor(splist1)) {
    splist1 <- as.character(splist1)
  }
  if (is.factor(splist2)) {
    splist2 <- as.character(splist2)
  }
  .names_check(splist1, "The first list of species name")
  .names_check(splist2, "The second list of species name")

  # Run the search on each list with the same settings
  search1 <- lcvp_search(splist = splist1,
                         max_distance = max_distance,
                         genus_fuzzy = genus_fuzzy,
                         grammar_check = grammar_check)
  if (is.null(search1)) {
    stop(paste("No match found for splist1.",
               "Try increasing the 'max_distance' argument."),
         call. = FALSE)
  }
  search2 <- lcvp_search(splist = splist2,
                         max_distance = max_distance,
                         genus_fuzzy = genus_fuzzy,
                         grammar_check = grammar_check)
  if (is.null(search2)) {
    stop(paste("No match found for splist2.",
               "Try increasing the 'max_distance' argument."),
         call. = FALSE)
  }

  # Match the two lists on the id returned by the search.
  # `incomparables = NA` prevents two missing ids from matching each other.
  match_pos <- match(search1$global.Id,
                     search2$global.Id,
                     incomparables = NA)

  # Adjust output: one row per element of splist1 (first search column dropped)
  sp2 <- splist2[match_pos]
  result <- data.frame(
    "Species.List.1" = splist1,
    "Species.List.2" = sp2,
    search1[, -1, drop = FALSE],
    "Match.Position.2to1" = match_pos,
    stringsAsFactors = FALSE
  )

  ## Include species only in second dataset
  if (include_all) {
    pos_no_match <- which(!(splist2 %in% sp2))
    if (length(pos_no_match) > 0) {
      # Build every extra row in one typed data.frame and bind once.
      # Binding a character vector row by row would coerce all columns of
      # `result` (including Match.Position.2to1) to character, which breaks
      # positional indexing such as splist2[result$Match.Position.2to1].
      extra_rows <- data.frame(
        "Species.List.1" = rep(NA_character_, length(pos_no_match)),
        "Species.List.2" = splist2[pos_no_match],
        search2[pos_no_match, -1, drop = FALSE],
        "Match.Position.2to1" = pos_no_match,
        row.names = NULL, # keep sequential row names after binding
        stringsAsFactors = FALSE
      )
      result <- rbind(result, extra_rows)
    }
  }

  if (identify_dups) {
    # NOTE(review): output_pos = 5 is a fixed column index into `result`;
    # confirm it still points at the output taxon column of lcvp_search().
    result$Duplicated.Output.Position <- .find_dups(result, output_pos = 5)
  }
  result
}