-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutf8.c
128 lines (113 loc) · 3.7 KB
/
utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
* $Id$
*
* Copyright (c) 2006-2010 Oskari Saarenmaa <[email protected]>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stddef.h>
#include <stdlib.h>
#include "utf8.h"
// decodes a single utf8 character into a single latin9 byte (or '?')
// returns the number of bytes decoded
int fg_utf8_decode_char(const unsigned char *str, size_t len, unsigned char *out)
{
unsigned int c = (unsigned int) (str[0]), b = 1;
if (c >= 0xf0) /* four-byte utf8 character */
b = 4;
else if (c >= 0xe0) /* three-byte utf8 character */
b = 3;
else if (c >= 0xc0) /* two-byte utf8 character */
b = 2;
else /* one-byte utf8 character, direct mapping to iso-8859-1 */
{
*out = *str;
return 1;
}
if (b > len) /* buffer underrun */
{
*out = '?';
return len;
}
c = (str[b-1] & 63) | ((str[b-2] & 63) << 6);
if (b > 2)
c |= (str[b-3] & 63) << 12;
if (b > 3)
c |= (str[b-4] & 63) << 18;
if (c <= 0xff) /* 8 bit character */
*out = (unsigned char) c; /* direct mapping */
else if (c == 0x220ac) /* utf8 euro (U+20AC) */
*out = (unsigned char) 0xa4; /* latin9 euro */
else /* character does not fit in iso-8859-1 space */
*out = (unsigned char) '?'; /* everything else becomes a question mark */
return b;
}
unsigned char *fg_utf8_decode(const unsigned char *str, size_t *szp)
{
size_t sz = *szp;
unsigned char *out = malloc(sz+1), *op = out;
unsigned int x;
for (x=0; x<sz;)
{
x += fg_utf8_decode_char(str + x, sz - x, op);
op += 1;
}
*op = '\0';
*szp = op-out;
return out;
}
// encode a single character, out must be at least 4 bytes long.
// returns the number of bytes written.
int fg_utf8_encode_char(unsigned char c, unsigned char *out)
{
unsigned char *op = out;
if (c == 0xa4) /* latin9 euro - not direct mapped */
{
*op++ = 0xe2; /* hardcoded utf8 encoding for euro */
*op++ = 0x82;
*op++ = 0xac;
}
else if (c >= 0x80) /* other latin9 characters should be representable like this.. */
{
*op++ = 0xc0 | (c >> 6);
*op++ = 0x80 | (c & 63);
}
else /* direct mapping in utf8 */
{
*op++ = c;
}
return op - out;
}
unsigned char *fg_utf8_encode(const unsigned char *str, size_t *szp)
{
size_t sz = *szp;
unsigned char *out = malloc(sz*3+1), *op=out;
unsigned int x;
for (x=0; x<sz; x++)
{
op += fg_utf8_encode_char(str[x], op);
}
*op = '\0';
*szp = op-out;
return out;
}