Skip to content

Commit 43a1ce9

Browse files
committed
Use option parting infrastucture for xml_read options as well
1 parent 7610cd0 commit 43a1ce9

File tree

6 files changed

+160
-73
lines changed

6 files changed

+160
-73
lines changed

R/RcppExports.R

+4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ read_connection_ <- function(con, chunk_size = 64 * 1024L) {
55
.Call('xml2_read_connection_', PACKAGE = 'xml2', con, chunk_size)
66
}
77

8+
xml_parse_options <- function() {
9+
.Call('xml2_xml_parse_options', PACKAGE = 'xml2')
10+
}
11+
812
doc_parse_file <- function(path, encoding = "", as_html = FALSE, options = 0L) {
913
.Call('xml2_doc_parse_file', PACKAGE = 'xml2', path, encoding, as_html, options)
1014
}

R/utils.R

+26
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,29 @@ describe_options <- function(x) {
4040
paste0(" \\item{", names(x), "}{", attr(x, "description"), "}", collapse = "\n"),
4141
"\n}")
4242
}
43+
44+
s_quote <- function(x) paste0("'", x, "'")
45+
46+
# Similar to match.arg, but returns character() with NULL or empty input and
47+
# errors if any of the inputs are not found (fixing
48+
# https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=16659)
49+
parse_options <- function(arg, options) {
50+
if (is.numeric(arg)) {
51+
return(as.integer(arg))
52+
}
53+
54+
if (is.null(arg) || !nzchar(arg)) {
55+
return(0L)
56+
}
57+
58+
# set duplicates.ok = TRUE so any duplicates are counted differently than
59+
# non-matches, then take only unique results
60+
i <- pmatch(arg, names(options), duplicates.ok = TRUE)
61+
if (any(is.na(i))) {
62+
stop(sprintf("`options` %s is not a valid option, should be one of %s",
63+
s_quote(arg[is.na(i)][1L]),
64+
paste(s_quote(names(options)), collapse = ", ")),
65+
call. = FALSE)
66+
}
67+
sum(options[unique(i)])
68+
}

R/xml_parse.R

+10-70
Original file line numberDiff line numberDiff line change
@@ -22,33 +22,8 @@
2222
#' iteration. Defaults to 64kb.
2323
#' @param verbose When reading from a slow connection, this prints some
2424
#' output on every iteration so you know its working.
25-
#' @param options Set parsing options for the libxml2 parser. These are
26-
#' specified as a character vector of options to set. Available values are
27-
#' \describe{
28-
#' \item{RECOVER}{recover on errors}
29-
#' \item{NOENT}{substitute entities}
30-
#' \item{DTDLOAD}{load the external subset}
31-
#' \item{DTDATTR}{default DTD attributes}
32-
#' \item{DTDVALID}{validate with the DTD}
33-
#' \item{NOERROR}{suppress error reports}
34-
#' \item{NOWARNING}{suppress warning reports}
35-
#' \item{PEDANTIC}{pedantic error reporting}
36-
#' \item{NOBLANKS}{remove blank nodes}
37-
#' \item{SAX1}{use the SAX1 interface internally}
38-
#' \item{XINCLUDE}{Implement XInclude substitition}
39-
#' \item{NONET}{Forbid network access}
40-
#' \item{NODICT}{Do not reuse the context dictionary}
41-
#' \item{NSCLEAN}{remove redundant namespaces declarations}
42-
#' \item{NOCDATA}{merge CDATA as text nodes}
43-
#' \item{NOXINCNODE}{do not generate XINCLUDE START/END nodes}
44-
#' \item{COMPACT}{compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)}
45-
#' \item{OLD10}{parse using XML-1.0 before update 5}
46-
#' \item{NOBASEFIX}{do not fixup XINCLUDE xml:base uris}
47-
#' \item{HUGE}{relax any hardcoded limit from the parser}
48-
#' \item{OLDSAX}{parse using SAX2 interface before 2.7.0}
49-
#' \item{IGNORE_ENC}{ignore internal document encoding hint}
50-
#' \item{BIG_LINES}{Store big lines numbers in text PSVI field}
51-
#' }
25+
#' @param options Set parsing options for the libxml2 parser. Zero of more of
26+
#' \Sexpr[results=rd]{xml2:::describe_options(xml2:::xml_parse_options())}
5227
#' @return An XML document. HTML is normalised to valid XML - this may not
5328
#' be exactly the same transformation performed by the browser, but it's
5429
#' a reasonable approximation.
@@ -77,6 +52,8 @@ read_html <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "
7752

7853
#' @export
7954
read_html.default <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
55+
options <- parse_options(options, xml_parse_options())
56+
8057
suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options))
8158
}
8259

@@ -85,6 +62,7 @@ read_html.response <- function(x, encoding = "", options = c("RECOVER",
8562
"NOERROR", "NOBLANKS"), ...) {
8663
need_package("httr")
8764

65+
options <- parse_options(options, xml_parse_options())
8866
content <- httr::content(x, as = "raw")
8967
xml2::read_html(content, encoding = encoding, options = options, ...)
9068
}
@@ -94,7 +72,7 @@ read_html.response <- function(x, encoding = "", options = c("RECOVER",
9472
read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
9573
options = "NOBLANKS") {
9674

97-
options <- parse_options(options)
75+
options <- parse_options(options, xml_parse_options())
9876
if (grepl("<|>", x)) {
9977
read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html, options = options)
10078
} else {
@@ -114,7 +92,7 @@ read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
11492
#' @rdname read_xml
11593
read_xml.raw <- function(x, encoding = "", base_url = "", ...,
11694
as_html = FALSE, options = "NOBLANKS") {
117-
options <- parse_options(options)
95+
options <- parse_options(options, xml_parse_options())
11896

11997
doc <- doc_parse_raw(x, encoding = encoding, base_url = base_url,
12098
as_html = as_html, options = options)
@@ -126,6 +104,8 @@ read_xml.raw <- function(x, encoding = "", base_url = "", ...,
126104
read_xml.connection <- function(x, encoding = "", n = 64 * 1024,
127105
verbose = FALSE, ..., base_url = "",
128106
as_html = FALSE, options = "NOBLANKS") {
107+
options <- parse_options(options, xml_parse_options())
108+
129109
if (!isOpen(x)) {
130110
open(x, "rb")
131111
on.exit(close(x))
@@ -141,48 +121,8 @@ read_xml.response <- function(x, encoding = "", base_url = "", ...,
141121
as_html = FALSE, options = "NOBLANKS") {
142122
need_package("httr")
143123

124+
options <- parse_options(options, xml_parse_options())
144125
content <- httr::content(x, as = "raw")
145126
xml2::read_xml(content, encoding = encoding, base_url = base_url,
146127
as_html = as_html, option = options, ...)
147128
}
148-
149-
`%<<%` <- function(a, n) bitwShiftL(a, n)
150-
151-
# http://xmlsoft.org/html/libxml-parser.html#xmlParserOption
152-
parser_options <- c(
153-
"RECOVER" = 1 %<<% 0,
154-
"NOENT" = 1 %<<% 1,
155-
"DTDLOAD" = 1 %<<% 2,
156-
"DTDATTR" = 1 %<<% 3,
157-
"DTDVALID" = 1 %<<% 4,
158-
"NOERROR" = 1 %<<% 5,
159-
"NOWARNING" = 1 %<<% 6,
160-
"PEDANTIC" = 1 %<<% 7,
161-
"NOBLANKS" = 1 %<<% 8,
162-
"SAX1" = 1 %<<% 9,
163-
"XINCLUDE" = 1 %<<% 10,
164-
"NONET" = 1 %<<% 11,
165-
"NODICT" = 1 %<<% 12,
166-
"NSCLEAN" = 1 %<<% 13,
167-
"NOCDATA" = 1 %<<% 14,
168-
"NOXINCNODE" = 1 %<<% 15,
169-
"COMPACT" = 1 %<<% 16,
170-
"OLD10" = 1 %<<% 17,
171-
"NOBASEFIX" = 1 %<<% 18,
172-
"HUGE" = 1 %<<% 19,
173-
"OLDSAX" = 1 %<<% 20,
174-
"OLDSAX" = 1 %<<% 20,
175-
"IGNORE_ENC" = 1 %<<% 21,
176-
"BIG_LINES" = 1 %<<% 22)
177-
178-
parse_options <- function(options) {
179-
if (is.numeric(options)) {
180-
return(options)
181-
}
182-
mtch <- pmatch(options, names(parser_options))
183-
if (any(is.na(mtch))) {
184-
stop("`options` ", options[is.na(mtch)][1L], " is not a valid option", call. = FALSE)
185-
}
186-
187-
sum(parser_options[mtch])
188-
}

src/RcppExports.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,16 @@ BEGIN_RCPP
1818
return rcpp_result_gen;
1919
END_RCPP
2020
}
21+
// xml_parse_options
22+
Rcpp::IntegerVector xml_parse_options();
23+
RcppExport SEXP xml2_xml_parse_options() {
24+
BEGIN_RCPP
25+
Rcpp::RObject rcpp_result_gen;
26+
Rcpp::RNGScope rcpp_rngScope_gen;
27+
rcpp_result_gen = Rcpp::wrap(xml_parse_options());
28+
return rcpp_result_gen;
29+
END_RCPP
30+
}
2131
// doc_parse_file
2232
XPtrDoc doc_parse_file(std::string path, std::string encoding, bool as_html, int options);
2333
RcppExport SEXP xml2_doc_parse_file(SEXP pathSEXP, SEXP encodingSEXP, SEXP as_htmlSEXP, SEXP optionsSEXP) {

src/xml2_doc.cpp

+96
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,102 @@ using namespace Rcpp;
66
#include "xml2_types.h"
77
#include "xml2_utils.h"
88

9+
// [[Rcpp::export]]
10+
Rcpp::IntegerVector xml_parse_options() {
11+
const char * names[] = {
12+
"RECOVER",
13+
"NOENT",
14+
"DTDLOAD",
15+
"DTDATTR",
16+
"DTDVALID",
17+
"NOERROR",
18+
"NOWARNING",
19+
"PEDANTIC",
20+
"NOBLANKS",
21+
"SAX1",
22+
"XINCLUDE",
23+
"NONET",
24+
"NODICT",
25+
"NSCLEAN",
26+
"NOCDATA",
27+
"NOXINCNODE",
28+
"COMPACT",
29+
"OLD10",
30+
"NOBASEFIX",
31+
"HUGE",
32+
"OLDSAX",
33+
"IGNORE_ENC",
34+
"BIG_LINES",
35+
};
36+
37+
const int values[] = {
38+
XML_PARSE_RECOVER,
39+
XML_PARSE_NOENT,
40+
XML_PARSE_DTDLOAD,
41+
XML_PARSE_DTDATTR,
42+
XML_PARSE_DTDVALID,
43+
XML_PARSE_NOERROR,
44+
XML_PARSE_NOWARNING,
45+
XML_PARSE_PEDANTIC,
46+
XML_PARSE_NOBLANKS,
47+
XML_PARSE_SAX1,
48+
XML_PARSE_XINCLUDE,
49+
XML_PARSE_NONET,
50+
XML_PARSE_NODICT,
51+
XML_PARSE_NSCLEAN,
52+
XML_PARSE_NOCDATA,
53+
XML_PARSE_NOXINCNODE,
54+
XML_PARSE_COMPACT,
55+
XML_PARSE_OLD10,
56+
XML_PARSE_NOBASEFIX,
57+
XML_PARSE_HUGE,
58+
XML_PARSE_OLDSAX,
59+
XML_PARSE_IGNORE_ENC,
60+
XML_PARSE_BIG_LINES,
61+
};
62+
63+
const char * descriptions[] = {
64+
"recover on errors",
65+
"substitute entities",
66+
"load the external subset",
67+
"default DTD attributes",
68+
"validate with the DTD",
69+
"suppress error reports",
70+
"suppress warning reports",
71+
"pedantic error reporting",
72+
"remove blank nodes",
73+
"use the SAX1 interface internally",
74+
"Implement XInclude substitition",
75+
"Forbid network access",
76+
"Do not reuse the context dictionary",
77+
"remove redundant namespaces declarations",
78+
"merge CDATA as text nodes",
79+
"do not generate XINCLUDE START/END nodes",
80+
"compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)",
81+
"parse using XML-1.0 before update 5",
82+
"do not fixup XINCLUDE xml:base uris",
83+
"relax any hardcoded limit from the parser",
84+
"parse using SAX2 interface before 2.7.0",
85+
"ignore internal document encoding hint",
86+
"Store big lines numbers in text PSVI field",
87+
};
88+
89+
size_t size = sizeof(values) / sizeof(values[0]);
90+
91+
Rcpp::IntegerVector out_values = Rcpp::IntegerVector(size);
92+
Rcpp::CharacterVector out_names = Rcpp::CharacterVector(size);
93+
Rcpp::CharacterVector out_descriptions = Rcpp::CharacterVector(size);
94+
for (int i = 0; i < size; ++i) {
95+
out_values[i] = values[i];
96+
out_names[i] = names[i];
97+
out_descriptions[i] = descriptions[i];
98+
}
99+
out_values.attr("names") = out_names;
100+
out_values.attr("descriptions") = out_descriptions;
101+
102+
return out_values;
103+
}
104+
9105
// [[Rcpp::export]]
10106
XPtrDoc doc_parse_file(std::string path,
11107
std::string encoding = "",

tests/testthat/test-read-xml.R

+14-3
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,22 @@ test_that("read_html correctly parses malformed document", {
66
})
77

88
test_that("parse_options errors when given an invalid option", {
9-
expect_error(parse_options("INVALID"),
10-
"`options` INVALID is not a valid option")
9+
expect_error(parse_options("INVALID", xml_parse_options()),
10+
"`options` 'INVALID' is not a valid option")
1111

1212
expect_error(read_html("lego.html.bz2", options = "INVALID"),
13-
"`options` INVALID is not a valid option")
13+
"`options` 'INVALID' is not a valid option")
14+
15+
# Empty inputs returned as 0
16+
expect_identical(0L, parse_options("", xml_parse_options()))
17+
expect_identical(0L, parse_options(NULL, xml_parse_options()))
18+
19+
# Numerics returned as integers
20+
expect_identical(12L, parse_options(12L, xml_parse_options()))
21+
expect_identical(12L, parse_options(12, xml_parse_options()))
22+
23+
# Multiple inputs summed
24+
expect_identical(3L, parse_options(c("RECOVER", "NOENT"), xml_parse_options()))
1425
})
1526

1627
test_that("read_html properly passes parser arguments", {

0 commit comments

Comments
 (0)