22
22
# ' iteration. Defaults to 64kb.
23
23
# ' @param verbose When reading from a slow connection, this prints some
24
24
# ' output on every iteration so you know it's working.
25
- # ' @param options Set parsing options for the libxml2 parser. These are
26
- # ' specified as a character vector of options to set. Available values are
27
- # ' \describe{
28
- # ' \item{RECOVER}{recover on errors}
29
- # ' \item{NOENT}{substitute entities}
30
- # ' \item{DTDLOAD}{load the external subset}
31
- # ' \item{DTDATTR}{default DTD attributes}
32
- # ' \item{DTDVALID}{validate with the DTD}
33
- # ' \item{NOERROR}{suppress error reports}
34
- # ' \item{NOWARNING}{suppress warning reports}
35
- # ' \item{PEDANTIC}{pedantic error reporting}
36
- # ' \item{NOBLANKS}{remove blank nodes}
37
- # ' \item{SAX1}{use the SAX1 interface internally}
38
- # ' \item{XINCLUDE}{Implement XInclude substitution}
39
- # ' \item{NONET}{Forbid network access}
40
- # ' \item{NODICT}{Do not reuse the context dictionary}
41
- # ' \item{NSCLEAN}{remove redundant namespaces declarations}
42
- # ' \item{NOCDATA}{merge CDATA as text nodes}
43
- # ' \item{NOXINCNODE}{do not generate XINCLUDE START/END nodes}
44
- # ' \item{COMPACT}{compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)}
45
- # ' \item{OLD10}{parse using XML-1.0 before update 5}
46
- # ' \item{NOBASEFIX}{do not fixup XINCLUDE xml:base uris}
47
- # ' \item{HUGE}{relax any hardcoded limit from the parser}
48
- # ' \item{OLDSAX}{parse using SAX2 interface before 2.7.0}
49
- # ' \item{IGNORE_ENC}{ignore internal document encoding hint}
50
- # ' \item{BIG_LINES}{Store big lines numbers in text PSVI field}
51
- # ' }
25
+ # ' @param options Set parsing options for the libxml2 parser. Zero or more of
26
+ # ' \Sexpr[results=rd]{xml2:::describe_options(xml2:::xml_parse_options())}
52
27
# ' @return An XML document. HTML is normalised to valid XML - this may not
53
28
# ' be exactly the same transformation performed by the browser, but it's
54
29
# ' a reasonable approximation.
@@ -77,6 +52,8 @@ read_html <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "
77
52
78
53
# ' @export
79
54
read_html.default <- function (x , encoding = " " , ... , options = c(" RECOVER" , " NOERROR" , " NOBLANKS" )) {
55
+ options <- parse_options(options , xml_parse_options())
56
+
80
57
suppressWarnings(read_xml(x , encoding = encoding , ... , as_html = TRUE , options = options ))
81
58
}
82
59
@@ -85,6 +62,7 @@ read_html.response <- function(x, encoding = "", options = c("RECOVER",
85
62
" NOERROR" , " NOBLANKS" ), ... ) {
86
63
need_package(" httr" )
87
64
65
+ options <- parse_options(options , xml_parse_options())
88
66
content <- httr :: content(x , as = " raw" )
89
67
xml2 :: read_html(content , encoding = encoding , options = options , ... )
90
68
}
@@ -94,7 +72,7 @@ read_html.response <- function(x, encoding = "", options = c("RECOVER",
94
72
read_xml.character <- function (x , encoding = " " , ... , as_html = FALSE ,
95
73
options = " NOBLANKS" ) {
96
74
97
- options <- parse_options(options )
75
+ options <- parse_options(options , xml_parse_options() )
98
76
if (grepl(" <|>" , x )) {
99
77
read_xml.raw(charToRaw(enc2utf8(x )), " UTF-8" , ... , as_html = as_html , options = options )
100
78
} else {
@@ -114,7 +92,7 @@ read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
114
92
# ' @rdname read_xml
115
93
read_xml.raw <- function (x , encoding = " " , base_url = " " , ... ,
116
94
as_html = FALSE , options = " NOBLANKS" ) {
117
- options <- parse_options(options )
95
+ options <- parse_options(options , xml_parse_options() )
118
96
119
97
doc <- doc_parse_raw(x , encoding = encoding , base_url = base_url ,
120
98
as_html = as_html , options = options )
@@ -126,6 +104,8 @@ read_xml.raw <- function(x, encoding = "", base_url = "", ...,
126
104
read_xml.connection <- function (x , encoding = " " , n = 64 * 1024 ,
127
105
verbose = FALSE , ... , base_url = " " ,
128
106
as_html = FALSE , options = " NOBLANKS" ) {
107
+ options <- parse_options(options , xml_parse_options())
108
+
129
109
if (! isOpen(x )) {
130
110
open(x , " rb" )
131
111
on.exit(close(x ))
@@ -141,48 +121,8 @@ read_xml.response <- function(x, encoding = "", base_url = "", ...,
141
121
as_html = FALSE , options = " NOBLANKS" ) {
142
122
need_package(" httr" )
143
123
124
+ options <- parse_options(options , xml_parse_options())
144
125
content <- httr :: content(x , as = " raw" )
145
126
xml2 :: read_xml(content , encoding = encoding , base_url = base_url ,
146
127
as_html = as_html , option = options , ... )
147
128
}
148
-
149
- `%<<%` <- function (a , n ) bitwShiftL(a , n )
150
-
151
- # http://xmlsoft.org/html/libxml-parser.html#xmlParserOption
152
- parser_options <- c(
153
- " RECOVER" = 1 %<< % 0 ,
154
- " NOENT" = 1 %<< % 1 ,
155
- " DTDLOAD" = 1 %<< % 2 ,
156
- " DTDATTR" = 1 %<< % 3 ,
157
- " DTDVALID" = 1 %<< % 4 ,
158
- " NOERROR" = 1 %<< % 5 ,
159
- " NOWARNING" = 1 %<< % 6 ,
160
- " PEDANTIC" = 1 %<< % 7 ,
161
- " NOBLANKS" = 1 %<< % 8 ,
162
- " SAX1" = 1 %<< % 9 ,
163
- " XINCLUDE" = 1 %<< % 10 ,
164
- " NONET" = 1 %<< % 11 ,
165
- " NODICT" = 1 %<< % 12 ,
166
- " NSCLEAN" = 1 %<< % 13 ,
167
- " NOCDATA" = 1 %<< % 14 ,
168
- " NOXINCNODE" = 1 %<< % 15 ,
169
- " COMPACT" = 1 %<< % 16 ,
170
- " OLD10" = 1 %<< % 17 ,
171
- " NOBASEFIX" = 1 %<< % 18 ,
172
- " HUGE" = 1 %<< % 19 ,
173
- " OLDSAX" = 1 %<< % 20 ,
174
- " OLDSAX" = 1 %<< % 20 ,
175
- " IGNORE_ENC" = 1 %<< % 21 ,
176
- " BIG_LINES" = 1 %<< % 22 )
177
-
178
- parse_options <- function (options ) {
179
- if (is.numeric(options )) {
180
- return (options )
181
- }
182
- mtch <- pmatch(options , names(parser_options ))
183
- if (any(is.na(mtch ))) {
184
- stop(" `options` " , options [is.na(mtch )][1L ], " is not a valid option" , call. = FALSE )
185
- }
186
-
187
- sum(parser_options [mtch ])
188
- }
0 commit comments