-
Notifications
You must be signed in to change notification settings - Fork 83
/
Copy pathxml2_doc.cpp
261 lines (238 loc) · 6.08 KB
/
xml2_doc.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#include <Rcpp.h>
using namespace Rcpp;
#include <libxml/parser.h>
#include <libxml/HTMLparser.h>
#include "xml2_types.h"
#include "xml2_utils.h"
// List the libxml2 parser option flags available in the libxml2 headers this
// file is compiled against.
//
// Returns an integer vector of XML_PARSE_* flag values. The vector carries
// two attributes of the same length and order:
//   "names"        - short option names (e.g. "RECOVER")
//   "descriptions" - a one-line description of each option
// [[Rcpp::export]]
Rcpp::IntegerVector xml_parse_options() {

// Options that are only listed when LIBXML_VERSION >= 20700.
#if defined(LIBXML_VERSION) && (LIBXML_VERSION >= 20700)
#define HAS_OLD10
#define HAS_NOBASEFIX
#define HAS_HUGE
#define HAS_OLDSAX
#endif

// XML_PARSE_IGNORE_ENC ("Add options to ignore the internal encoding"),
// libxml2 commit of Mon May 16 2011:
// https://github.com/GNOME/libxml2/commit/c62efc847c836d4c4f1aea08c68cd93bd342b9f4
#if defined(LIBXML_VERSION) && (LIBXML_VERSION >= 20800)
#define HAS_IGNORE_ENC
#endif

// XML_PARSE_BIG_LINES ("Add support for big line numbers in error
// reporting"), libxml2 commit of Mon Aug 13 2012:
// https://github.com/GNOME/libxml2/commit/968a03a2e54f5bcf53089f5e3c8f790dbe0bf824
#if defined(LIBXML_VERSION) && (LIBXML_VERSION >= 20900)
#define HAS_BIG_LINES
#endif

  // One row per option, so a name, its flag and its description cannot drift
  // out of step with each other.
  struct ParseOption {
    const char* name;
    int value;
    const char* description;
  };

  const ParseOption options[] = {
    {"RECOVER", XML_PARSE_RECOVER, "recover on errors"},
    {"NOENT", XML_PARSE_NOENT, "substitute entities"},
    {"DTDLOAD", XML_PARSE_DTDLOAD, "load the external subset"},
    {"DTDATTR", XML_PARSE_DTDATTR, "default DTD attributes"},
    {"DTDVALID", XML_PARSE_DTDVALID, "validate with the DTD"},
    {"NOERROR", XML_PARSE_NOERROR, "suppress error reports"},
    {"NOWARNING", XML_PARSE_NOWARNING, "suppress warning reports"},
    {"PEDANTIC", XML_PARSE_PEDANTIC, "pedantic error reporting"},
    {"NOBLANKS", XML_PARSE_NOBLANKS, "remove blank nodes"},
    {"SAX1", XML_PARSE_SAX1, "use the SAX1 interface internally"},
    {"XINCLUDE", XML_PARSE_XINCLUDE, "Implement XInclude substitition"},
    {"NONET", XML_PARSE_NONET, "Forbid network access"},
    {"NODICT", XML_PARSE_NODICT, "Do not reuse the context dictionary"},
    {"NSCLEAN", XML_PARSE_NSCLEAN, "remove redundant namespaces declarations"},
    {"NOCDATA", XML_PARSE_NOCDATA, "merge CDATA as text nodes"},
    {"NOXINCNODE", XML_PARSE_NOXINCNODE, "do not generate XINCLUDE START/END nodes"},
    {"COMPACT", XML_PARSE_COMPACT, "compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)"},
#ifdef HAS_OLD10
    {"OLD10", XML_PARSE_OLD10, "parse using XML-1.0 before update 5"},
#endif
#ifdef HAS_NOBASEFIX
    {"NOBASEFIX", XML_PARSE_NOBASEFIX, "do not fixup XINCLUDE xml:base uris"},
#endif
#ifdef HAS_HUGE
    {"HUGE", XML_PARSE_HUGE, "relax any hardcoded limit from the parser"},
#endif
#ifdef HAS_OLDSAX
    {"OLDSAX", XML_PARSE_OLDSAX, "parse using SAX2 interface before 2.7.0"},
#endif
#ifdef HAS_IGNORE_ENC
    {"IGNORE_ENC", XML_PARSE_IGNORE_ENC, "ignore internal document encoding hint"},
#endif
#ifdef HAS_BIG_LINES
    {"BIG_LINES", XML_PARSE_BIG_LINES, "Store big lines numbers in text PSVI field"},
#endif
  };

  const size_t count = sizeof(options) / sizeof(options[0]);

  Rcpp::IntegerVector flags(count);
  Rcpp::CharacterVector flag_names(count);
  Rcpp::CharacterVector flag_descriptions(count);

  // Split the table into the three parallel R vectors.
  for (size_t idx = 0; idx < count; ++idx) {
    const ParseOption& entry = options[idx];
    flags[idx] = entry.value;
    flag_names[idx] = entry.name;
    flag_descriptions[idx] = entry.description;
  }

  flags.attr("names") = flag_names;
  flags.attr("descriptions") = flag_descriptions;

  return flags;

// Keep the helper macros from leaking into the rest of the file.
#undef HAS_OLD10
#undef HAS_NOBASEFIX
#undef HAS_HUGE
#undef HAS_OLDSAX
#undef HAS_IGNORE_ENC
#undef HAS_BIG_LINES
}
// Parse the document at `path` with libxml2 and wrap the result.
//
//   path     - handed unchanged to htmlReadFile() / xmlReadFile()
//   encoding - encoding name for the parser; "" is passed on as NULL
//   as_html  - true selects htmlReadFile(), false selects xmlReadFile()
//   options  - parser option bits, forwarded unchanged
//
// Raises an R error (Rcpp::stop) when the parser returns NULL.
// [[Rcpp::export]]
XPtrDoc doc_parse_file(std::string path,
                       std::string encoding = "",
                       bool as_html = false,
                       int options = 0) {
  const char* file_name = path.c_str();
  const char* enc = encoding.empty() ? NULL : encoding.c_str();

  xmlDoc* parsed = as_html ? htmlReadFile(file_name, enc, options)
                           : xmlReadFile(file_name, enc, options);

  if (parsed == NULL)
    Rcpp::stop("Failed to parse %s", path);

  return XPtrDoc(parsed);
}
// Parse an in-memory document held in a raw vector and wrap the result.
//
//   x        - raw vector whose bytes are given to the parser
//   encoding - encoding name for the parser; "" is passed on as NULL
//   base_url - base URL for the document; "" is passed on as NULL
//   as_html  - true selects htmlReadMemory(), false selects xmlReadMemory()
//   options  - parser option bits, forwarded unchanged
//
// Raises an R error (Rcpp::stop) when the parser returns NULL.
// [[Rcpp::export]]
XPtrDoc doc_parse_raw(RawVector x, std::string encoding,
                      std::string base_url = "",
                      bool as_html = false,
                      int options = 0) {
  const char* bytes = reinterpret_cast<const char*>(RAW(x));
  const int n_bytes = Rf_length(x);
  const char* url = base_url.empty() ? NULL : base_url.c_str();
  const char* enc = encoding.empty() ? NULL : encoding.c_str();

  xmlDoc* parsed = as_html
    ? htmlReadMemory(bytes, n_bytes, url, enc, options)
    : xmlReadMemory(bytes, n_bytes, url, enc, options);

  if (parsed == NULL)
    Rcpp::stop("Failed to parse text");

  return XPtrDoc(parsed);
}
// Return the root element of document `x`, as reported by
// xmlDocGetRootElement(), wrapped in an XPtrNode.
// The document pointer is obtained through checked_get().
// [[Rcpp::export]]
XPtrNode doc_root(XPtrDoc x) {
  xmlDoc* document = x.checked_get();
  xmlNode* root = xmlDocGetRootElement(document);
  return XPtrNode(root);
}
// Report whether xmlDocGetRootElement() yields a non-NULL root for `x`.
// Note: this reads the pointer with get(), not checked_get() as doc_root()
// does. NOTE(review): presumably so an invalid document pointer yields false
// instead of an error; confirm against XPtrDoc's definition.
// [[Rcpp::export]]
bool doc_has_root(XPtrDoc x) {
  xmlNode* root = xmlDocGetRootElement(x.get());
  return root != NULL;
}
// Return the document's URL field as a character vector built from a single
// string: NA when the field is NULL, otherwise the URL marked as UTF-8.
// [[Rcpp::export]]
CharacterVector doc_url(XPtrDoc x) {
  SEXP url;
  if (x->URL == NULL) {
    url = NA_STRING;
  } else {
    url = Rf_mkCharCE(reinterpret_cast<const char*>(x->URL), CE_UTF8);
  }
  return CharacterVector(url);
}
// Create a new, empty document.
//
//   version  - XML version string given to xmlNewDoc()
//   encoding - encoding name; it is looked up with
//              xmlFindCharEncodingHandler() and the handler's own name is
//              stored in the document's `encoding` field
//
// Raises an R error (Rcpp::stop) if libxml2 has no handler for `encoding`.
// [[Rcpp::export]]
XPtrDoc doc_new(std::string version, std::string encoding = "UTF-8") {
  XPtrDoc x = XPtrDoc(xmlNewDoc(asXmlChar(version)));

  xmlCharEncodingHandlerPtr handler = xmlFindCharEncodingHandler(encoding.c_str());
  // libxml2 returns NULL for an encoding it does not know; dereferencing that
  // would crash the session, so report it as an ordinary error instead.
  if (handler == NULL)
    Rcpp::stop("Unknown encoding: %s", encoding);

  // Copy the name first: the document must own its own string, and the
  // handler is released immediately afterwards.
  xmlChar* encoding_name = xmlStrdup(reinterpret_cast<const xmlChar *>(handler->name));

  // Hand the handler back to libxml2 so it can release any resources that
  // were allocated for this lookup.
  xmlCharEncCloseFunc(handler);

  x->encoding = encoding_name;
  return x;
}
// Make `root` the root element of `doc` via xmlDocSetRootElement() and return
// whatever node pointer that call yields, wrapped in an XPtrNode.
// [[Rcpp::export]]
XPtrNode doc_set_root(XPtrDoc doc, XPtrNode root) {
  xmlNode* returned = xmlDocSetRootElement(doc, root);
  return XPtrNode(returned);
}