forked from oils-for-unix/oils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutf8.h
170 lines (139 loc) · 4.84 KB
/
utf8.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#ifndef DATA_LANG_UTF8_H
#define DATA_LANG_UTF8_H
#include <stddef.h> // size_t
#include <stdint.h> // uint32_t
#include <stdio.h>
/**
* ---- Quick reference about the encoding ----
*
* First, all valid UTF-8 sequences follow one of four bit "patterns" (Table
* 3-6). The first byte determines the length of the sequence, and the next
* 0-3 bytes are "continuation bytes."
*
* +----------------------------+----------+----------+----------+----------+
* | Scalar Value | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
* +----------------------------+----------+----------+----------+----------+
* | 00000000 0xxxxxxx | 0xxxxxxx | | | |
* | 00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | |
* | zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | |
* | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
* +----------------------------+----------+----------+----------+----------+
*
* Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
*
* There are 3 further restrictions which make some valid bit patterns
* *invalid*:
* 1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second
* sequence is longer and thus an error.
* 2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
* surrogate. It is an error to encode surrogates in UTF-8.
* 3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
* and must be rejected as an error.
*
* See https://aolsen.ca/writings/everything-about-utf8 for more details about
* the encoding.
*/
// Outcome of a single decode attempt. UTF8_OK is zero, so code in this file
// can test a result's `error` field for truthiness to detect a failure.
typedef enum Utf8Error {
// Decoding succeeded
UTF8_OK = 0,
// Encodes a codepoint in more bytes than necessary
UTF8_ERR_OVERLONG = 1,
// Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF, inclusive)
UTF8_ERR_SURROGATE = 2,
// Encodes a value greater than the max codepoint U+10FFFF
UTF8_ERR_TOO_LARGE = 3,
// Encoding doesn't conform to the UTF-8 bit patterns: either the first byte
// matches none of the four lead-byte patterns, or a byte that should be a
// continuation byte is not of the form 10xxxxxx.
UTF8_ERR_BAD_ENCODING = 4,
// It looks like there is another codepoint, but it has been truncated: a zero
// byte was found where a continuation byte was expected.
UTF8_ERR_TRUNCATED_BYTES = 5,
} Utf8Error_t;
// Output of utf8_decode(). All three fields are reset at the start of every
// utf8_decode() call, so the struct does not need to be initialized by the
// caller.
typedef struct Utf8Result {
// UTF8_OK on success, otherwise the reason decoding failed.
Utf8Error_t error;
// The decoded value. On overlong, surrogate and too-large errors this still
// holds the value recovered from the bytes. On bad-encoding and truncation
// errors it holds only a partially accumulated value.
uint32_t codepoint;
// Number of input bytes consumed. The first byte is always counted. A
// malformed continuation byte is counted as well, but the zero byte that
// triggers a truncation error is not.
size_t bytes_read;
} Utf8Result_t;
/**
 * Consume one continuation byte of the sequence currently being decoded.
 *
 * Reads the byte at `input[result->bytes_read]` and, if it matches the
 * continuation pattern 10xx_xxxx, appends its low six bits to
 * `result->codepoint`.
 *
 * - If `result->error` is already set, nothing is read or modified, so calls
 *   can be chained and the error checked once afterwards.
 * - A zero byte sets UTF8_ERR_TRUNCATED_BYTES and is NOT counted in
 *   `bytes_read`.
 * - Any other byte is counted in `bytes_read`; if it is not a continuation
 *   byte, UTF8_ERR_BAD_ENCODING is set and `codepoint` is left as it was.
 */
static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
  // An earlier failure is sticky: do not touch the input any further.
  if (result->error) {
    return;
  }

  const unsigned char next = input[result->bytes_read];

  // Reaching the terminator here means the sequence was cut short.
  if (next == 0) {
    result->error = UTF8_ERR_TRUNCATED_BYTES;
    return;
  }

  // The byte is consumed whether or not it turns out to be well-formed.
  result->bytes_read++;

  // The top two bits must be exactly '10'.
  if ((next & 0xC0) != 0x80) {
    result->error = UTF8_ERR_BAD_ENCODING;
    return;
  }

  // Make room for, then append, the six payload bits.
  result->codepoint = (result->codepoint << 6) | (next & 0x3F);
}
/**
* Given a nul-terminated string `input`, try to decode the next codepoint from
* that string.
*
* It is required that `input` does not point to the nul-terminator. If
* `*input == '\0'`, then it is assumed that the zero-byte is meant to encode
* U+00, not a sentinel. The nul-terminator is still necessary because we need
* it to prevent buffer overrun in the case of a truncated byte sequence, for
* example '\xC2'. This oddity is to facilitate strings which may contain U+00
* codepoints.
*
* If there was a surrogate, overlong or codepoint-too-large error, then
* `result.codepoint` will contain the recovered value.
*/
static inline void utf8_decode(const unsigned char *input,
                               Utf8Result_t *result) {
  const unsigned char lead = input[0];

  // Start from a clean slate. The lead byte is always counted as consumed,
  // even when it turns out not to be a valid lead byte.
  result->error = UTF8_OK;
  result->codepoint = 0;
  result->bytes_read = 1;

  // Classify the lead byte: how many continuation bytes follow, and what is
  // the smallest value that legitimately needs a sequence of this length.
  int tail_count = 0;
  uint32_t min_value = 0;

  if (lead < 0x80) {
    // 0xxxxxxx: a single byte (the ASCII subset) encodes itself.
    result->codepoint = lead;
    return;
  } else if ((lead & 0xE0) == 0xC0) {
    // 110yyyyy: 2-byte sequence.
    tail_count = 1;
    min_value = 0x80;
    result->codepoint = lead & 0x1F;
  } else if ((lead & 0xF0) == 0xE0) {
    // 1110zzzz: 3-byte sequence.
    tail_count = 2;
    min_value = 0x800;
    result->codepoint = lead & 0x0F;
  } else if ((lead & 0xF8) == 0xF0) {
    // 11110uuu: 4-byte sequence.
    tail_count = 3;
    min_value = 0x10000;
    result->codepoint = lead & 0x07;
  } else {
    // A stray continuation byte (10xxxxxx) or a lead byte of 11111xxx.
    result->error = UTF8_ERR_BAD_ENCODING;
    return;
  }

  // Fold in the continuation bytes. _cont() becomes a no-op once an error is
  // recorded, so a single check after the loop is enough.
  for (int i = 0; i < tail_count; i++) {
    _cont(input, result);
  }
  if (result->error != UTF8_OK) {
    return;
  }

  // The bit pattern was fine; now reject values that are not allowed. The
  // decoded value is deliberately left in result->codepoint in each case.
  const uint32_t value = result->codepoint;
  if (value < min_value) {
    // Checked first, so a 4-byte form of a surrogate reports as overlong.
    result->error = UTF8_ERR_OVERLONG;
  } else if (0xD800 <= value && value <= 0xDFFF) {
    result->error = UTF8_ERR_SURROGATE;
  } else if (value > 0x10FFFF) {
    result->error = UTF8_ERR_TOO_LARGE;
  }
}
#endif // DATA_LANG_UTF8_H