forked from tesseract-ocr/tesseract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunicharmap.cpp
172 lines (149 loc) · 5.84 KB
/
unicharmap.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
///////////////////////////////////////////////////////////////////////
// File: unicharmap.cpp
// Description: Unicode character/ligature to integer id class.
// Author: Thomas Kielbus
// Created: Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <assert.h>
#include "unichar.h"
#include "host.h"
#include "unicharmap.h"
// Builds an empty map: the root node array is not allocated until the first
// insert().
UNICHARMAP::UNICHARMAP()
    : nodes(0) {}
// Frees the root node array. Each node's destructor frees its own child
// array, so this releases the whole tree.
UNICHARMAP::~UNICHARMAP() {
  // delete[] on a null pointer is a no-op, so no explicit check is needed.
  delete[] nodes;
}
// Looks up unichar_repr in the tree and returns the id stored for it. Each
// byte of the string, taken as an unsigned char, indexes one level of the
// tree: every byte but the last selects the child array to descend into, and
// the last byte selects the node whose id is returned.
// Returns -1 (the id of a default-constructed node) when nothing was inserted
// for the string. That includes the cases where the map is empty or where the
// walk reaches a node that has no child array; those cases previously
// dereferenced a null pointer.
// unichar_repr must be a non-empty NUL-terminated string. This is asserted;
// when asserts are compiled out, NULL or empty input yields -1 instead of
// reading out of bounds.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
  assert(unichar_repr != 0 && *unichar_repr != '\0');
  if (unichar_repr == 0 || *unichar_repr == '\0')
    return -1;

  const UNICHARMAP_NODE* level = nodes;
  for (const char* cursor = unichar_repr; level != 0; ++cursor) {
    const UNICHARMAP_NODE& slot = level[static_cast<unsigned char>(*cursor)];
    // The byte before the terminator is the last one: its node holds the id.
    if (cursor[1] == '\0')
      return slot.id;
    level = slot.children;
  }
  // Ran out of tree before running out of string: not in the map.
  return -1;
}
// Looks up at most length leading bytes of unichar_repr in the tree and
// returns the id stored for them. Each byte, taken as an unsigned char,
// indexes one level of the tree. The walk ends at whichever comes first: the
// length-th byte or the byte just before the NUL terminator.
// Returns -1 (the id of a default-constructed node) when nothing was inserted
// for that byte sequence. That includes the cases where the map is empty or
// where the walk reaches a node that has no child array; those cases
// previously dereferenced a null pointer.
// unichar_repr must be a non-empty NUL-terminated string and length must lie
// in [1, UNICHAR_LEN]. Both are asserted; when asserts are compiled out, NULL
// or empty input yields -1 instead of reading out of bounds.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
  assert(unichar_repr != 0 && *unichar_repr != '\0');
  assert(length > 0 && length <= UNICHAR_LEN);
  if (unichar_repr == 0 || *unichar_repr == '\0')
    return -1;

  const UNICHARMAP_NODE* level = nodes;
  const char* cursor = unichar_repr;
  for (int remaining = length; level != 0; --remaining, ++cursor) {
    const UNICHARMAP_NODE& slot = level[static_cast<unsigned char>(*cursor)];
    // Stop on the last permitted byte or the last byte of the string.
    if (remaining == 1 || cursor[1] == '\0')
      return slot.id;
    level = slot.children;
  }
  // Ran out of tree before consuming the requested bytes: not in the map.
  return -1;
}
// Search the given unichar representation in the tree, creating the possibly
// missing nodes. Once the right place has been found, insert the given id and
// update the inserted flag to keep track of the insert. Each character in the
// string is interpreted as an index in an array of nodes.
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
const char* current_char = unichar_repr;
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
assert(*unichar_repr != '\0');
assert(id >= 0);
do {
if (*current_nodes_pointer == 0)
*current_nodes_pointer = new UNICHARMAP_NODE[256];
if (*(current_char + 1) == '\0') {
(*current_nodes_pointer)
[static_cast<unsigned char>(*current_char)].id = id;
return;
}
current_nodes_pointer =
&((*current_nodes_pointer)
[static_cast<unsigned char>(*current_char)].children);
++current_char;
} while (true);
}
// Search the given unichar representation in the tree. Each character in the
// string is interpreted as an index in an array of nodes. Stop once the tree
// does not have anymore nodes or once we found the right unichar_repr.
bool UNICHARMAP::contains(const char* const unichar_repr) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != 0 && *(current_char + 1) != '\0') {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return current_nodes != 0 && *(current_char + 1) == '\0' &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}
// Search the given unichar representation in the tree, using length characters
// from it maximum. Each character in the string is interpreted as an index in
// an array of nodes. Stop once the tree does not have anymore nodes or once we
// found the right unichar_repr.
bool UNICHARMAP::contains(const char* const unichar_repr,
int length) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
if (length <= 0 || length > UNICHAR_LEN) return false;
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
--length;
++current_char;
}
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}
// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != NULL && *current_char != '\0') {
if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
return current_char + 1 - unichar_repr;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return 0;
}
// Removes every entry by freeing the whole tree and returning the map to its
// freshly constructed, empty state.
void UNICHARMAP::clear() {
  // delete[] on a null pointer is a no-op, so an already-empty map is fine.
  delete[] nodes;
  nodes = 0;
}
// A fresh node has no child array and stores no id. The id -1 marks the node
// as unused: contains() and minmatch() only accept ids >= 0.
UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE()
    : children(0), id(-1) {}
// Frees this node's child array. Destroying the array runs this destructor on
// every child in turn, so the whole subtree is released recursively.
UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
  // delete[] on a null pointer is a no-op, so leaf nodes need no check.
  delete[] children;
}