Skip to content

Commit db1b095

Browse files
r-paisergi
authored and committed
Index out of range panic in DiffCharsToLines on large JSON diff
1 parent df97e07 commit db1b095

File tree

3 files changed

+100,121
-63
lines changed

3 files changed

+100,121
-63
lines changed

diffmatchpatch/diff.go

Lines changed: 73 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ const (
3434
DiffInsert Operation = 1
3535
// DiffEqual item represents an equal diff.
3636
DiffEqual Operation = 0
37+
// IndexSeperator is used to separate the array indexes in an index string.
38+
IndexSeperator = ","
3739
)
3840

3941
// Diff represents one diff operation
@@ -396,65 +398,17 @@ func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string
396398

397399
// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line.
398400
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
399-
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
400-
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
401-
lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4
402-
403-
chars1 := dmp.diffLinesToRunesMunge(text1, &lineArray, lineHash)
404-
chars2 := dmp.diffLinesToRunesMunge(text2, &lineArray, lineHash)
405-
406-
return chars1, chars2, lineArray
401+
chars1, chars2, lineArray := dmp.DiffLinesToStrings(text1, text2)
402+
return []rune(chars1), []rune(chars2), lineArray
407403
}
408404

409405
// diffLinesToRunes is the []rune-input counterpart of DiffLinesToRunes: both inputs are converted to strings and handed to DiffLinesToRunes, whose results are returned unchanged.
func (dmp *DiffMatchPatch) diffLinesToRunes(text1, text2 []rune) ([]rune, []rune, []string) {
	s1, s2 := string(text1), string(text2)
	return dmp.DiffLinesToRunes(s1, s2)
}
412408

413-
// diffLinesToRunesMunge splits a text into an array of strings, and reduces the texts to a []rune where each Unicode character represents one line.
414-
// We use strings instead of []runes as input mainly because you can't use []rune as a map key.
415-
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineArray *[]string, lineHash map[string]int) []rune {
416-
// Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
417-
lineStart := 0
418-
lineEnd := -1
419-
runes := []rune{}
420-
421-
for lineEnd < len(text)-1 {
422-
lineEnd = indexOf(text, "\n", lineStart)
423-
424-
if lineEnd == -1 {
425-
lineEnd = len(text) - 1
426-
}
427-
428-
line := text[lineStart : lineEnd+1]
429-
lineStart = lineEnd + 1
430-
lineValue, ok := lineHash[line]
431-
432-
if ok {
433-
runes = append(runes, rune(lineValue))
434-
} else {
435-
*lineArray = append(*lineArray, line)
436-
lineHash[line] = len(*lineArray) - 1
437-
runes = append(runes, rune(len(*lineArray)-1))
438-
}
439-
}
440-
441-
return runes
442-
}
443-
444409
// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
445410
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
446-
hydrated := make([]Diff, 0, len(diffs))
447-
for _, aDiff := range diffs {
448-
chars := aDiff.Text
449-
text := make([]string, len(chars))
450-
451-
for i, r := range chars {
452-
text[i] = lineArray[r]
453-
}
454-
455-
aDiff.Text = strings.Join(text, "")
456-
hydrated = append(hydrated, aDiff)
457-
}
411+
hydrated := dmp.DiffStringsToLines(diffs, lineArray)
458412
return hydrated
459413
}
460414

@@ -1343,3 +1297,71 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
13431297

13441298
return diffs, nil
13451299
}
1300+
1301+
// DiffLinesToStrings splits two texts into a list of strings. Each string represents one line.
1302+
func (dmp *DiffMatchPatch) DiffLinesToStrings(text1, text2 string) (string, string, []string) {
1303+
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
1304+
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
1305+
1306+
//Each string has the index of lineArray which it points to
1307+
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray)
1308+
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray)
1309+
1310+
//Adding a delimter to later get the strings as array
1311+
str1 := strings.Join(strIndexArray1[:], IndexSeperator)
1312+
str2 := strings.Join(strIndexArray2[:], IndexSeperator)
1313+
1314+
return str1, str2, lineArray
1315+
}
1316+
1317+
// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []rune where each Unicode character represents one line.
1318+
// We use strings instead of []runes as input mainly because you can't use []rune as a map key.
1319+
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []string {
1320+
// Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
1321+
lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4
1322+
lineStart := 0
1323+
lineEnd := -1
1324+
strings := []string{}
1325+
1326+
for lineEnd < len(text)-1 {
1327+
lineEnd = indexOf(text, "\n", lineStart)
1328+
1329+
if lineEnd == -1 {
1330+
lineEnd = len(text) - 1
1331+
}
1332+
1333+
line := text[lineStart : lineEnd+1]
1334+
lineStart = lineEnd + 1
1335+
lineValue, ok := lineHash[line]
1336+
1337+
if ok {
1338+
strings = append(strings, strconv.Itoa(lineValue))
1339+
} else {
1340+
*lineArray = append(*lineArray, line)
1341+
lineHash[line] = len(*lineArray) - 1
1342+
strings = append(strings, strconv.Itoa(len(*lineArray)-1))
1343+
}
1344+
}
1345+
1346+
return strings
1347+
}
1348+
1349+
// DiffStringsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
1350+
func (dmp *DiffMatchPatch) DiffStringsToLines(diffs []Diff, lineArray []string) []Diff {
1351+
hydrated := make([]Diff, 0, len(diffs))
1352+
for _, aDiff := range diffs {
1353+
chars := strings.Split(aDiff.Text, IndexSeperator)
1354+
text := make([]string, len(chars))
1355+
1356+
for i, r := range chars {
1357+
i1, err := strconv.Atoi(r)
1358+
if err == nil {
1359+
text[i] = lineArray[i1]
1360+
}
1361+
}
1362+
1363+
aDiff.Text = strings.Join(text, "")
1364+
hydrated = append(hydrated, aDiff)
1365+
}
1366+
return hydrated
1367+
}

diffmatchpatch/diff_test.go

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ package diffmatchpatch
1111
import (
1212
"bytes"
1313
"fmt"
14+
"io/ioutil"
15+
"os"
1416
"strconv"
1517
"strings"
1618
"testing"
@@ -312,10 +314,10 @@ func TestDiffLinesToChars(t *testing.T) {
312314
dmp := New()
313315

314316
for i, tc := range []TestCase{
315-
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
316-
{"a", "b", "\u0001", "\u0002", []string{"", "a", "b"}},
317+
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
318+
{"a", "b", "1", "2", []string{"", "a", "b"}},
317319
// Omit final newline.
318-
{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", []string{"", "alpha\n", "beta\n", "alpha"}},
320+
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
319321
} {
320322
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
321323
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -328,14 +330,14 @@ func TestDiffLinesToChars(t *testing.T) {
328330
lineList := []string{
329331
"", // Account for the initial empty element of the lines array.
330332
}
331-
var charList []rune
333+
var charList []string
332334
for x := 1; x < n+1; x++ {
333335
lineList = append(lineList, strconv.Itoa(x)+"\n")
334-
charList = append(charList, rune(x))
336+
charList = append(charList, strconv.Itoa(x))
335337
}
336338
lines := strings.Join(lineList, "")
337-
chars := string(charList)
338-
assert.Equal(t, n, utf8.RuneCountInString(chars))
339+
chars := strings.Join(charList[:], ",")
340+
assert.Equal(t, n, len(strings.Split(chars, ",")))
339341

340342
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
341343
assert.Equal(t, chars, actualChars1)
@@ -356,8 +358,8 @@ func TestDiffCharsToLines(t *testing.T) {
356358
for i, tc := range []TestCase{
357359
{
358360
Diffs: []Diff{
359-
{DiffEqual, "\u0001\u0002\u0001"},
360-
{DiffInsert, "\u0002\u0001\u0002"},
361+
{DiffEqual, "1,2,1"},
362+
{DiffInsert, "2,1,2"},
361363
},
362364
Lines: []string{"", "alpha\n", "beta\n"},
363365

@@ -376,14 +378,15 @@ func TestDiffCharsToLines(t *testing.T) {
376378
lineList := []string{
377379
"", // Account for the initial empty element of the lines array.
378380
}
379-
charList := []rune{}
381+
charList := []string{}
380382
for x := 1; x <= n; x++ {
381383
lineList = append(lineList, strconv.Itoa(x)+"\n")
382-
charList = append(charList, rune(x))
384+
charList = append(charList, strconv.Itoa(x))
383385
}
384386
assert.Equal(t, n, len(charList))
387+
chars := strings.Join(charList[:], ",")
385388

386-
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineList)
389+
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
387390
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
388391
}
389392

@@ -1496,3 +1499,35 @@ func BenchmarkDiffMainRunesLargeLines(b *testing.B) {
14961499
diffs = dmp.DiffCharsToLines(diffs, linearray)
14971500
}
14981501
}
1502+
1503+
func BenchmarkDiffMainStringsLargeLines(b *testing.B) {
1504+
s1, s2 := speedtestTexts()
1505+
1506+
dmp := New()
1507+
1508+
b.ResetTimer()
1509+
1510+
for i := 0; i < b.N; i++ {
1511+
text1, text2, linearray := dmp.DiffLinesToStrings(s1, s2)
1512+
1513+
diffs := dmp.DiffMain(text1, text2, false)
1514+
diffs = dmp.DiffStringsToLines(diffs, linearray)
1515+
}
1516+
}
1517+
1518+
func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
1519+
fp, _ := os.Open("../testdata/diff10klinestest.txt")
1520+
defer fp.Close()
1521+
data, _ := ioutil.ReadAll(fp)
1522+
1523+
dmp := New()
1524+
1525+
b.ResetTimer()
1526+
1527+
for i := 0; i < b.N; i++ {
1528+
text1, text2, linearray := dmp.DiffLinesToRunes(string(data), "")
1529+
1530+
diffs := dmp.DiffMainRunes(text1, text2, false)
1531+
diffs = dmp.DiffCharsToLines(diffs, linearray)
1532+
}
1533+
}

0 commit comments

Comments
 (0)