-
Notifications
You must be signed in to change notification settings - Fork 3
/
tokeniser.R
64 lines (60 loc) · 1.25 KB
/
tokeniser.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#' Word Tokenizer
#'
#' Tokenize a sentence into words using Python NLTK's `word_tokenize`.
#'
#' @param sentence Sentence to tokenize.
#'
#' @return The tokens produced by `nltk.word_tokenize`, as converted by
#'   reticulate (typically a character vector or list of word tokens).
#'
#' @examples
#' \dontrun{
#' word_tokenize("This is an R package.")
#' }
#'
#' @import assertthat
#'
#' @export
word_tokenize <- function(sentence) {
  # Fail early with a clear message rather than a cryptic Python error.
  assert_that(!missing(sentence), msg = "Missing `sentence`")
  nltk$word_tokenize(sentence)
}
#' POS Tagger
#'
#' Tag each token with its part of speech using Python NLTK's `pos_tag`.
#'
#' @param tokens Tokens to tag, e.g. the output of [word_tokenize()].
#' @param to_r Whether to convert the result to a tidy tibble with
#'   columns `word` and `tag`.
#'
#' @return When `to_r = FALSE` (the default), the object returned by
#'   `nltk.pos_tag`; when `to_r = TRUE`, a tibble with columns `word`
#'   and `tag`, one row per token.
#'
#' @examples
#' \dontrun{
#' tokens <- word_tokenize("This is an R package.")
#' pos_tag(tokens)
#' }
#'
#' @export
pos_tag <- function(tokens, to_r = FALSE) {
  # Fail early with a clear message rather than a cryptic Python error.
  assert_that(!missing(tokens), msg = "Missing `tokens`")
  pos <- nltk$pos_tag(tokens)
  if (to_r) {
    # Each Python (word, tag) pair becomes a named pair, then a tibble row.
    pos <- pos %>%
      reticulate::py_to_r() %>%
      purrr::map(purrr::set_names, c("word", "tag")) %>%
      purrr::map_dfr(tibble::as_tibble)
  }
  pos
}
#' Named Entity Extraction
#'
#' Extract named entities from POS-tagged tokens using Python NLTK's
#' `ne_chunk`.
#'
#' @param pos POS-tagged tokens, e.g. the output of [pos_tag()].
#'
#' @return The chunk tree returned by `nltk.chunk.ne_chunk`, as converted
#'   by reticulate.
#'
#' @examples
#' \dontrun{
#' tokens <- word_tokenize("This is an R package.")
#' tagged <- pos_tag(tokens)
#' ne_chunk(tagged)
#' }
#'
#' @export
ne_chunk <- function(pos) {
  # Fail early with a clear message rather than a cryptic Python error.
  assert_that(!missing(pos), msg = "Missing `pos`")
  nltk$chunk$ne_chunk(pos)
}