Skip to content

Commit

Permalink
handle arbitrarily nested XML tags with the same name
Browse files Browse the repository at this point in the history
  • Loading branch information
miku committed Apr 21, 2024
1 parent 27272f3 commit e3d36d3
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 64 deletions.
85 changes: 54 additions & 31 deletions record/split.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package record
import (
"bytes"
"errors"
"index/suffixarray"
"io"
"sort"
"sync"
)

Expand Down Expand Up @@ -160,50 +162,71 @@ func (s *TagSplitter) Split(data []byte, atEOF bool) (advance int, token []byte,
// copyContent reads at most one element content from the internal buffer and
// writes it to the given writer. Returns the number of bytes read, e.g. zero
// if no complete element has been found in the internal buffer. This may fail
// on invalid XML, if a single element size exceeds a limit or as a current
// restriction, if data contains nested tags of the same name.
// on invalid XML or very large XML elements.
func (s *TagSplitter) copyContent(w io.Writer) (n int, err error) {
if len(s.buf) > maxBufSize {
return 0, ErrMaxBufSizeExceeded
}
var start, end, last int
if start = s.indexOpeningTag(s.buf); start == -1 {
index := suffixarray.New(s.buf)
// We can treat both opening tag variants the same, because they happen
// to have the same length.
ot1 := index.Lookup(s.openingTag1, -1)
ot2 := index.Lookup(s.openingTag2, -1)
openingTagIndices := append(ot1, ot2...)
if len(openingTagIndices) == 0 {
return 0, errOpenTagNotFound
}
if end = s.indexClosingTag(s.buf); end == -1 {
closingTagIndices := index.Lookup(s.closingTag, -1)
if len(closingTagIndices) == 0 {
return 0, nil
}
if end < start {
return 0, ErrGarbledInput
}
last = end + len(s.Tag) + 3
if s.indexOpeningTag(s.buf[start+1:end]) != -1 {
return 0, ErrNestedTagsNotImplemented
var start, end, last int
if len(openingTagIndices) == 1 && len(closingTagIndices) == 1 {
start = openingTagIndices[0]
end = closingTagIndices[0]
if end < start {
return 0, ErrGarbledInput
}
last = end + len(s.Tag) + 3 // TODO: assumes </...>
} else {
sort.Ints(openingTagIndices)
sort.Ints(closingTagIndices)
start, end = findMatchingTags(openingTagIndices, closingTagIndices)
if end < start {
return 0, ErrGarbledInput
}
if start == -1 {
// no matching tag found
return 0, nil
}
last = end + len(s.Tag) + 3 // TODO: assumes </...>
}
n, err = w.Write(s.buf[start:last])
s.buf = s.buf[last:] // TODO: optimize this, ringbuffer?
return
}

// indexOpeningTag returns the index of the first opening tag in data, or -1;
// cf. https://www.w3.org/TR/REC-xml/#sec-starttags
func (s *TagSplitter) indexOpeningTag(data []byte) int {
// TODO: this seems to be a bigger bottleneck
// (https://i.imgur.com/fYzN2mq.png) than I originally thought. Average
// size of data is about 3K.
u := bytes.Index(data, s.openingTag1)
v := bytes.Index(data, s.openingTag2)
if u == -1 && v == -1 {
return -1
}
if v == -1 || u < v {
return u
} else {
return v
// findMatchingTags pairs the first opening tag with the closing tag that
// balances it, taking nested tags of the same name into account.
//
// Both opening and closing must hold byte offsets sorted in ascending order.
//
// Results:
//
//   - (opening[0], closing[j]) when closing[j] is the closing tag that brings
//     the nesting depth back to zero,
//   - (-1, -1) when either slice is empty or no balancing closing tag has been
//     seen yet (e.g. the element is still incomplete),
//   - (opening[0], closing[0]) with end < start, when a closing tag shows up
//     before the first opening tag. Such input is unbalanced; handing back the
//     offending pair lets the caller detect it via its end < start check
//     instead of silently mispairing tags.
func findMatchingTags(opening []int, closing []int) (int, int) {
	if len(opening) == 0 || len(closing) == 0 {
		return -1, -1
	}
	// A closing tag ahead of the first opening tag would drive the depth
	// counter negative and shift every subsequent pairing by one.
	if closing[0] < opening[0] {
		return opening[0], closing[0]
	}
	var next, depth int
	for _, c := range closing {
		// Every opening tag located before this closing tag nests one level
		// deeper.
		for next < len(opening) && opening[next] < c {
			depth++
			next++
		}
		// The closing tag itself ends the innermost open element.
		depth--
		if depth == 0 {
			return opening[0], c
		}
	}
	return -1, -1
}

// indexClosingTag reports the offset of the first occurrence of the closing
// tag within data; the result is -1 when data holds no closing tag.
func (s *TagSplitter) indexClosingTag(data []byte) int {
	pos := bytes.Index(data, s.closingTag)
	return pos
}
55 changes: 22 additions & 33 deletions record/split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,39 +117,28 @@ func TestSplit(t *testing.T) {
}
}

func TestIndexXTag(t *testing.T) {
var cases = []struct {
s *TagSplitter
data []byte
openingTagIndex int
closingTagIndex int
}{
{
s: &TagSplitter{
Tag: "a",
},
data: []byte("none"),
openingTagIndex: -1,
closingTagIndex: -1,
},
{
s: &TagSplitter{
Tag: "a",
},
data: []byte("hello <a> world </a>"),
openingTagIndex: 6,
closingTagIndex: 16,
},
}
for _, c := range cases {
c.s.ensureTags() // use private, leak private
ot := c.s.indexOpeningTag(c.data)
if ot != c.openingTagIndex {
t.Fatalf("got %v, want %v", ot, c.openingTagIndex)
}
ct := c.s.indexClosingTag(c.data)
if ct != c.closingTagIndex {
t.Fatalf("got %v, want %v", ct, c.closingTagIndex)
// BenchmarkTagSplitter measures scanning a small document that contains
// nested and sibling elements of the same tag name.
//
// The splitter and the scanner are created anew in every iteration: a
// bufio.Scanner is exhausted after one pass over its reader, so sharing one
// across iterations would leave all iterations but the first with nothing to
// scan.
func BenchmarkTagSplitter(b *testing.B) {
	data := `
....................<a>................
.......................................
..............<a></a>..................
.......................................
.......................................
.......................................
<a>...</a>...............<a>....</a>...
.......................................
.......................................
.......................................
...................................</a>
`
	for n := 0; n < b.N; n++ {
		ts := TagSplitter{Tag: "a", MaxBytesApprox: 8}
		s := bufio.NewScanner(strings.NewReader(data))
		s.Split(ts.Split)
		var count int
		for s.Scan() {
			_ = s.Text()
			count++
		}
	}
}

0 comments on commit e3d36d3

Please sign in to comment.