Skip to content

Commit

Permalink
u8decode: detect invalid encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
leahneukirchen committed Nov 23, 2017
1 parent 7364136 commit 3e6f804
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions u8decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

// Decode one UTF-8 codepoint into cp, return number of bytes to next one.
// On invalid UTF-8, return -1; cp is untouched if the lead byte is rejected,
// but may hold a partial value if a later continuation byte is bad.
// Overlong sequences, surrogates and invalid codepoints are not checked.
// Invalid codepoints are not checked.
//
// This code is meant to be inlined, if cp is unused it can be optimized away.
static int
Expand All @@ -12,10 +12,18 @@ u8decode(const char *cs, uint32_t *cp)

if (*s == 0) { *cp = 0; return 0; }
if (*s < 0x80) { *cp = *s; return 1; }
if (*s < 0xc0) { return -1; }
if (*s < 0xc2) { return -1; } //cont+overlong
if (*s < 0xe0) { *cp = *s & 0x1f; goto u2; }
if (*s < 0xf0) { *cp = *s & 0x0f; goto u3; }
if (*s < 0xf8) { *cp = *s & 0x07; goto u4; }
if (*s < 0xf0) {
if (*s == 0xe0 && (s[1] & 0xe0) == 0x80) return -1; //overlong
if (*s == 0xed && (s[1] & 0xe0) == 0xa0) return -1; //surrogate
*cp = *s & 0x0f; goto u3;
}
if (*s < 0xf5) {
if (*s == 0xf0 && (s[1] & 0xf0) == 0x80) return -1; //overlong
if (*s == 0xf4 && (s[1] > 0x8f)) return -1; //too high
*cp = *s & 0x07; goto u4;
}
return -1;

u4: if ((*++s & 0xc0) != 0x80) return -1; *cp = (*cp << 6) | (*s & 0x3f);
Expand Down

0 comments on commit 3e6f804

Please sign in to comment.