forked from joshdk/go-junit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.go
120 lines (107 loc) · 3.91 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
// Copyright Josh Komoroske. All rights reserved.
// Use of this source code is governed by the MIT license,
// a copy of which can be found in the LICENSE.txt file.
// SPDX-License-Identifier: MIT
package junit
import (
"bytes"
"encoding/xml"
"errors"
"html"
"io"
)
// reparentXML returns a reader that yields the contents of the given reader
// (assumed to be valid XML) enclosed in a synthetic <fake-root> element.
//
// The XML specification requires a document to have exactly one root node,
// but the input handled here may contain several top-level nodes, or none at
// all. Decoding such input directly would only surface the first top-level
// node (silently losing the rest), and a blank document would fail to parse.
// Wrapping everything in a single fake root avoids both problems.
func reparentXML(reader io.Reader) io.Reader {
	var (
		opening = bytes.NewReader([]byte("<fake-root>"))
		closing = bytes.NewReader([]byte("</fake-root>"))
	)

	// The readers are consumed strictly in order: opening tag, the caller's
	// data, closing tag.
	return io.MultiReader(opening, reader, closing)
}
// extractContent converts the raw inner contents of an XML node into plain,
// normalized text.
//
// The raw data may be made up of two kinds of segments, in any order and any
// number:
//
// Escaped text, which is entity-decoded:
//
//	"Hello, world!"  →  "Hello, world!"
//	"I &lt;/3 XML"   →  "I </3 XML"
//
// CDATA sections, whose payload is copied verbatim (no entity decoding):
//
//	"<![CDATA[Hello, world!]]>"  →  "Hello, world!"
//	"<![CDATA[I &lt;/3 XML]]>"   →  "I &lt;/3 XML"
//	"<![CDATA[I </3 XML]]>"      →  "I </3 XML"
//
// Interleaved segments are decoded one by one and concatenated:
//
//	"I &lt;/3 XML <![CDATA[a lot]]>. You probably <![CDATA[</3 XML]]> too."
//	  →  "I </3 XML a lot. You probably </3 XML too."
//
// An error is returned when a "<![CDATA[" has no closing "]]>", or when a
// "]]>" is found in the text following the last CDATA section (or anywhere,
// if there is no CDATA section at all). On error the returned slice is nil.
func extractContent(data []byte) ([]byte, error) {
	var (
		openMarker  = []byte("<![CDATA[")
		closeMarker = []byte("]]>")
		result      []byte
	)

	// Each pass consumes one run of escaped text followed by one CDATA
	// section. The loop ends when no further CDATA section is found.
	for {
		start := bytes.Index(data, openMarker)
		if start < 0 {
			// Only escaped text remains; a stray "]]>" here has no opener.
			if bytes.Contains(data, closeMarker) {
				return nil, errors.New("unmatched CDATA end tag")
			}

			return append(result, html.UnescapeString(string(data))...), nil
		}

		// Decode the escaped text that precedes the CDATA section, then step
		// over the opening marker.
		result = append(result, html.UnescapeString(string(data[:start]))...)
		data = data[start+len(openMarker):]

		end := bytes.Index(data, closeMarker)
		if end < 0 {
			return nil, errors.New("unmatched CDATA start tag")
		}

		// Copy the CDATA payload untouched, then step over the closing marker.
		result = append(result, data[:end]...)
		data = data[end+len(closeMarker):]
	}
}
// parse decodes the XML read from reader into a tree of xmlNode values and
// returns the top-level nodes of the document.
//
// The input is first wrapped in a synthetic root element by reparentXML, so
// the nodes returned are the children of that fake root. Any decoding error
// is returned as-is, together with a nil slice.
func parse(reader io.Reader) ([]xmlNode, error) {
	var root xmlNode

	decoder := xml.NewDecoder(reparentXML(reader))
	if err := decoder.Decode(&root); err != nil {
		return nil, err
	}

	return root.Nodes, nil
}