Skip to content

Commit 2918fce

Browse files
committed
Ignore XML declaration in xpath_internal(), for UTF8 databases.
When a value contained an XML declaration naming some other encoding, this function interpreted UTF8 bytes as the named encoding, yielding mojibake. xml_parse() already has similar logic. This would be necessary but not sufficient for non-UTF8 databases, so preserve behavior there until the xpath facility can support such databases comprehensively. Back-patch to 9.3 (all supported versions).

Pavel Stehule and Noah Misch

Discussion: https://postgr.es/m/CAFj8pRC-dM=tT=QkGi+Achkm+gwPmjyOayGuUfXVumCxkDgYWg@mail.gmail.com
1 parent 5edc63b commit 2918fce

File tree

5 files changed

+142
-1
lines changed

5 files changed

+142
-1
lines changed

src/backend/utils/adt/xml.c

+13-1
Original file line number | Diff line number | Diff line change
@@ -3845,6 +3845,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
38453845
int32 xpath_len;
38463846
xmlChar *string;
38473847
xmlChar *xpath_expr;
3848+
size_t xmldecl_len = 0;
38483849
int i;
38493850
int ndim;
38503851
Datum *ns_names_uris;
@@ -3900,6 +3901,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
39003901
string = pg_xmlCharStrndup(datastr, len);
39013902
xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len);
39023903

3904+
/*
3905+
* In a UTF8 database, skip any xml declaration, which might assert
3906+
* another encoding. Ignore parse_xml_decl() failure, letting
3907+
* xmlCtxtReadMemory() report parse errors. Documentation disclaims
3908+
* xpath() support for non-ASCII data in non-UTF8 databases, so leave
3909+
* those scenarios bug-compatible with historical behavior.
3910+
*/
3911+
if (GetDatabaseEncoding() == PG_UTF8)
3912+
parse_xml_decl(string, &xmldecl_len, NULL, NULL, NULL);
3913+
39033914
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
39043915

39053916
PG_TRY();
@@ -3914,7 +3925,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
39143925
if (ctxt == NULL || xmlerrcxt->err_occurred)
39153926
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
39163927
"could not allocate parser context");
3917-
doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
3928+
doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
3929+
len - xmldecl_len, NULL, NULL, 0);
39183930
if (doc == NULL || xmlerrcxt->err_occurred)
39193931
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
39203932
"could not parse XML document");

src/test/regress/expected/xml.out

+31
Original file line number | Diff line number | Diff line change
@@ -670,6 +670,37 @@ SELECT xpath('/nosuchtag', '<root/>');
670670
{}
671671
(1 row)
672672

673+
-- Round-trip non-ASCII data through xpath().
674+
DO $$
675+
DECLARE
676+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
677+
degree_symbol text;
678+
res xml[];
679+
BEGIN
680+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
681+
-- the server encoding is not UTF8. The EXCEPTION block below,
682+
-- currently dead code, will be relevant if we remove this limitation.
683+
IF current_setting('server_encoding') <> 'UTF8' THEN
684+
RAISE LOG 'skip: encoding % unsupported for xml',
685+
current_setting('server_encoding');
686+
RETURN;
687+
END IF;
688+
689+
degree_symbol := convert_from('\xc2b0', 'UTF8');
690+
res := xpath('text()', (xml_declaration ||
691+
'<x>' || degree_symbol || '</x>')::xml);
692+
IF degree_symbol <> res[1]::text THEN
693+
RAISE 'expected % (%), got % (%)',
694+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
695+
res[1], convert_to(res[1]::text, 'UTF8');
696+
END IF;
697+
EXCEPTION
698+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
699+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
700+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
701+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
702+
END
703+
$$;
673704
-- Test xmlexists and xpath_exists
674705
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
675706
xmlexists

src/test/regress/expected/xml_1.out

+35
Original file line number | Diff line number | Diff line change
@@ -576,6 +576,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>');
576576
^
577577
DETAIL: This functionality requires the server to be built with libxml support.
578578
HINT: You need to rebuild PostgreSQL using --with-libxml.
579+
-- Round-trip non-ASCII data through xpath().
580+
DO $$
581+
DECLARE
582+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
583+
degree_symbol text;
584+
res xml[];
585+
BEGIN
586+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
587+
-- the server encoding is not UTF8. The EXCEPTION block below,
588+
-- currently dead code, will be relevant if we remove this limitation.
589+
IF current_setting('server_encoding') <> 'UTF8' THEN
590+
RAISE LOG 'skip: encoding % unsupported for xml',
591+
current_setting('server_encoding');
592+
RETURN;
593+
END IF;
594+
595+
degree_symbol := convert_from('\xc2b0', 'UTF8');
596+
res := xpath('text()', (xml_declaration ||
597+
'<x>' || degree_symbol || '</x>')::xml);
598+
IF degree_symbol <> res[1]::text THEN
599+
RAISE 'expected % (%), got % (%)',
600+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
601+
res[1], convert_to(res[1]::text, 'UTF8');
602+
END IF;
603+
EXCEPTION
604+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
605+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
606+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
607+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
608+
END
609+
$$;
610+
ERROR: unsupported XML feature
611+
DETAIL: This functionality requires the server to be built with libxml support.
612+
HINT: You need to rebuild PostgreSQL using --with-libxml.
613+
CONTEXT: PL/pgSQL function inline_code_block line 17 at assignment
579614
-- Test xmlexists and xpath_exists
580615
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
581616
ERROR: unsupported XML feature

src/test/regress/expected/xml_2.out

+31
Original file line number | Diff line number | Diff line change
@@ -650,6 +650,37 @@ SELECT xpath('/nosuchtag', '<root/>');
650650
{}
651651
(1 row)
652652

653+
-- Round-trip non-ASCII data through xpath().
654+
DO $$
655+
DECLARE
656+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
657+
degree_symbol text;
658+
res xml[];
659+
BEGIN
660+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
661+
-- the server encoding is not UTF8. The EXCEPTION block below,
662+
-- currently dead code, will be relevant if we remove this limitation.
663+
IF current_setting('server_encoding') <> 'UTF8' THEN
664+
RAISE LOG 'skip: encoding % unsupported for xml',
665+
current_setting('server_encoding');
666+
RETURN;
667+
END IF;
668+
669+
degree_symbol := convert_from('\xc2b0', 'UTF8');
670+
res := xpath('text()', (xml_declaration ||
671+
'<x>' || degree_symbol || '</x>')::xml);
672+
IF degree_symbol <> res[1]::text THEN
673+
RAISE 'expected % (%), got % (%)',
674+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
675+
res[1], convert_to(res[1]::text, 'UTF8');
676+
END IF;
677+
EXCEPTION
678+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
679+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
680+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
681+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
682+
END
683+
$$;
653684
-- Test xmlexists and xpath_exists
654685
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
655686
xmlexists

src/test/regress/sql/xml.sql

+32
Original file line number | Diff line number | Diff line change
@@ -189,6 +189,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>');
189189
SELECT xpath('name(/*)', '<root><sub/><sub/></root>');
190190
SELECT xpath('/nosuchtag', '<root/>');
191191

192+
-- Round-trip non-ASCII data through xpath().
193+
DO $$
194+
DECLARE
195+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
196+
degree_symbol text;
197+
res xml[];
198+
BEGIN
199+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
200+
-- the server encoding is not UTF8. The EXCEPTION block below,
201+
-- currently dead code, will be relevant if we remove this limitation.
202+
IF current_setting('server_encoding') <> 'UTF8' THEN
203+
RAISE LOG 'skip: encoding % unsupported for xml',
204+
current_setting('server_encoding');
205+
RETURN;
206+
END IF;
207+
208+
degree_symbol := convert_from('\xc2b0', 'UTF8');
209+
res := xpath('text()', (xml_declaration ||
210+
'<x>' || degree_symbol || '</x>')::xml);
211+
IF degree_symbol <> res[1]::text THEN
212+
RAISE 'expected % (%), got % (%)',
213+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
214+
res[1], convert_to(res[1]::text, 'UTF8');
215+
END IF;
216+
EXCEPTION
217+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
218+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
219+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
220+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
221+
END
222+
$$;
223+
192224
-- Test xmlexists and xpath_exists
193225
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
194226
SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');

0 commit comments

Comments
 (0)