forked from koreader/koreader-base
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utf8proc.lua
111 lines (95 loc) · 3.06 KB
/
utf8proc.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
--[[--
Module for utf8 string operations.
This is a LuaJIT FFI wrapper for utf8proc.
@module ffi.utf8proc
]]
local ffi = require("ffi")
local C = ffi.C
require("ffi/posix_h")
require("ffi/utf8proc_h")
local libutf8proc = ffi.loadlib("utf8proc", "3")
local Utf8Proc = {}
--- Lowercases an utf8-encoded string.
--- Dispatches to the normalizing (NFKC_Casefold) implementation unless
--- normalization is explicitly switched off.
--- @string str string to lowercase
--- @bool normalize normalizes the string during operation (treated as true when nil)
--- @treturn string the lowercased string
function Utf8Proc.lowercase(str, normalize)
    -- Only an explicit `false` selects the plain per-codepoint path;
    -- nil (argument omitted) and any truthy value mean "normalize".
    if normalize == false then
        return Utf8Proc.cased_dumb(str, true)
    end
    return Utf8Proc.lowercase_NFKC_Casefold(str)
end
-- with normalization
--- Lowercases an utf8-encoded string through utf8proc's NFKC_Casefold mapping.
--- The C buffer returned by utf8proc is copied into a Lua string and then freed.
--- @string str string to case-fold
--- @treturn[1] string the case-folded string
--- @treturn[2] nil when utf8proc returns NULL (its documented error result)
--- @treturn[2] string error message
function Utf8Proc.lowercase_NFKC_Casefold(str)
    local folded_strz = libutf8proc.utf8proc_NFKC_Casefold(str)
    -- A NULL cdata pointer compares equal to nil in LuaJIT. Passing NULL to
    -- ffi.string() would dereference it, so bail out before touching it.
    if folded_strz == nil then
        return nil, "utf8proc_NFKC_Casefold failed"
    end
    local folded_str = ffi.string(folded_strz)
    C.free(folded_strz)
    return folded_str
end
-- no normalization here
--- Lowercases an utf8-encoded string codepoint by codepoint.
--- Thin wrapper around Utf8Proc.cased_dumb with the lowercase flag set.
--- @string str string to lowercase
--- @return whatever Utf8Proc.cased_dumb returns for the lowercase direction
function Utf8Proc.lowercase_dumb(str)
    local to_lower = true
    return Utf8Proc.cased_dumb(str, to_lower)
end
--- Uppercases an utf8-encoded string codepoint by codepoint.
--- Thin wrapper around Utf8Proc.cased_dumb with the lowercase flag cleared.
--- @string str string to uppercase
--- @return whatever Utf8Proc.cased_dumb returns for the uppercase direction
function Utf8Proc.uppercase_dumb(str)
    local to_lower = false
    return Utf8Proc.cased_dumb(str, to_lower)
end
--- Changes the case of an utf8-encoded string one codepoint at a time,
--- without any normalization.
--- Conversion stops at the first byte sequence utf8proc_iterate rejects; the
--- part converted up to that point is returned.
--- @string str string to convert (may contain NUL bytes)
--- @bool is_lower true to lowercase, false or nil to uppercase
--- @treturn string the converted string
function Utf8Proc.cased_dumb(str, is_lower)
    local str_len = #str -- byte length; the string may contain NUL
    local str_p = ffi.cast("const utf8proc_uint8_t *", str)
    local codepoint = ffi.new("utf8proc_int32_t[1]")
    -- Dedicated scratch buffer for one encoded codepoint (UTF-8 needs at most
    -- 4 bytes). Lua strings are immutable and interned, so their memory must
    -- never be used as a writable buffer.
    local tmp_p = ffi.new("utf8proc_uint8_t[4]")
    -- Pick the mapping function once instead of on every iteration.
    local convert = is_lower and libutf8proc.utf8proc_tolower
        or libutf8proc.utf8proc_toupper

    -- Collect the pieces and join once: repeated `..` in a loop is quadratic.
    local pieces = {}
    local n = 0
    local pos = 0
    while pos < str_len do
        -- Pass the remaining byte count so decoding is bounded by the string
        -- itself. tonumber() keeps `pos` a plain Lua number even where the C
        -- return type is a 64-bit integer (boxed cdata).
        local bytes = tonumber(
            libutf8proc.utf8proc_iterate(str_p + pos, str_len - pos, codepoint))
        if bytes <= 0 then
            -- invalid input: return what has been converted so far
            break
        end
        local len = tonumber(
            libutf8proc.utf8proc_encode_char(convert(codepoint[0]), tmp_p))
        n = n + 1
        -- Explicit length: a NUL codepoint encodes to a single 0x00 byte that
        -- a NUL-terminated read would silently drop.
        pieces[n] = ffi.string(tmp_p, len)
        pos = pos + bytes
    end
    return table.concat(pieces)
end
--- Normalizes an utf8-encoded string to NFC.
--- The C buffer returned by utf8proc is copied into a Lua string and then freed.
--- @string str string to normalize
--- @treturn[1] string the normalized string
--- @treturn[2] nil when utf8proc returns NULL (its documented error result)
--- @treturn[2] string error message
function Utf8Proc.normalize_NFC(str)
    local normalized_strz = libutf8proc.utf8proc_NFC(str)
    -- A NULL cdata pointer compares equal to nil in LuaJIT. Passing NULL to
    -- ffi.string() would dereference it, so bail out before touching it.
    if normalized_strz == nil then
        return nil, "utf8proc_NFC failed"
    end
    local normalized_str = ffi.string(normalized_strz)
    C.free(normalized_strz)
    return normalized_str
end
--- Counts codepoints in an utf8-encoded string.
--- Walks the string with utf8proc_iterate and stops at the first sequence
--- it rejects.
--- @string str string to count codepoints in
--- @return (int, bool) number of codepoints counted, whether the whole string was decoded successfully
function Utf8Proc.count(str)
    local total_bytes = #str -- byte length; the string may contain NUL
    local base = ffi.cast("const utf8proc_uint8_t *", str)
    local decoded = ffi.new("utf8proc_int32_t[1]")
    local n_codepoints = 0
    local offset = 0
    while offset < total_bytes do
        local step = libutf8proc.utf8proc_iterate(base + offset, -1, decoded)
        if not (step > 0) then
            -- decoding failed: report the count reached so far
            return n_codepoints, false
        end
        offset = offset + step
        n_codepoints = n_codepoints + 1
    end
    return n_codepoints, true
end
return Utf8Proc