Skip to content

Commit dda5e6e

Browse files
committed
Define the behavior for invalid UTF-8 sequences
Fixes sergi#21
1 parent 5f9c862 commit dda5e6e

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

diffmatchpatch/diff.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,13 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff {
4545
}
4646

4747
// DiffMain finds the differences between two texts.
48+
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
4849
func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
4950
return dmp.DiffMainRunes([]rune(text1), []rune(text2), checklines)
5051
}
5152

5253
// DiffMainRunes finds the differences between two rune sequences.
54+
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
5355
func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
5456
var deadline time.Time
5557
if dmp.DiffTimeout > 0 {
@@ -209,6 +211,7 @@ func (dmp *DiffMatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time)
209211
}
210212

211213
// DiffBisect finds the 'middle snake' of a diff, splits the problem in two, and returns the recursively constructed diff.
214+
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
212215
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
213216
func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) []Diff {
214217
// Unused in this code, but retained for interface compatibility.
@@ -353,7 +356,7 @@ func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string
353356
return string(chars1), string(chars2), lineArray
354357
}
355358

356-
// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line.
359+
// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line.
357360
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
358361
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
359362
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'

diffmatchpatch/diff_test.go

+10
Original file line numberDiff line numberDiff line change
@@ -1167,6 +1167,11 @@ func TestDiffBisect(t *testing.T) {
11671167
actual := dmp.DiffBisect("cat", "map", tc.Time)
11681168
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %s", i, tc.Name))
11691169
}
1170+
1171+
// Test for invalid UTF-8 sequences
1172+
assert.Equal(t, []Diff{
1173+
Diff{DiffEqual, "��"},
1174+
}, dmp.DiffBisect("\xe0\xe5", "\xe0\xe5", time.Now().Add(time.Minute)))
11701175
}
11711176

11721177
func TestDiffMain(t *testing.T) {
@@ -1297,6 +1302,11 @@ func TestDiffMain(t *testing.T) {
12971302
actual := dmp.DiffMain(tc.Text1, tc.Text2, false)
12981303
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))
12991304
}
1305+
1306+
// Test for invalid UTF-8 sequences
1307+
assert.Equal(t, []Diff{
1308+
Diff{DiffDelete, "��"},
1309+
}, dmp.DiffMain("\xe0\xe5", "", false))
13001310
}
13011311

13021312
func TestDiffMainWithTimeout(t *testing.T) {

0 commit comments

Comments
 (0)