Skip to content

Commit ced3a93

Browse files
committed
Fix assorted bugs in contrib/unaccent's configuration file parsing.
Make it use t_isspace() to identify whitespace, rather than relying on sscanf, which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per a gripe from J Smith, though the fix is not exactly either of his proposed patches.
1 parent ffc703a commit ced3a93

File tree

1 file changed

+67
-19
lines changed

1 file changed

+67
-19
lines changed

contrib/unaccent/unaccent.c

+67-19
Original file line numberDiff line numberDiff line change
@@ -91,35 +91,83 @@ initSuffixTree(char *filename)
9191

9292
do
9393
{
94-
char src[4096];
95-
char trg[4096];
96-
int srclen;
97-
int trglen;
98-
char *line = NULL;
99-
94+
/*
95+
* pg_do_encoding_conversion() (called by tsearch_readline()) will
96+
* emit exception if it finds untranslatable characters in current
97+
* locale. We just skip such lines, continuing with the next.
98+
*/
10099
skip = true;
101100

102101
PG_TRY();
103102
{
104-
/*
105-
* pg_do_encoding_conversion() (called by tsearch_readline()) will
106-
* emit exception if it finds untranslatable characters in current
107-
* locale. We just skip such characters.
108-
*/
103+
char *line;
104+
109105
while ((line = tsearch_readline(&trst)) != NULL)
110106
{
111-
if (sscanf(line, "%s\t%s\n", src, trg) != 2)
112-
continue;
107+
/*
108+
* The format of each line must be "src trg" where src and trg
109+
* are sequences of one or more non-whitespace characters,
110+
* separated by whitespace. Whitespace at start or end of
111+
* line is ignored.
112+
*/
113+
int state;
114+
char *ptr;
115+
char *src = NULL;
116+
char *trg = NULL;
117+
int ptrlen;
118+
int srclen = 0;
119+
int trglen = 0;
120+
121+
state = 0;
122+
for (ptr = line; *ptr; ptr += ptrlen)
123+
{
124+
ptrlen = pg_mblen(ptr);
125+
/* ignore whitespace, but end src or trg */
126+
if (t_isspace(ptr))
127+
{
128+
if (state == 1)
129+
state = 2;
130+
else if (state == 3)
131+
state = 4;
132+
continue;
133+
}
134+
switch (state)
135+
{
136+
case 0:
137+
/* start of src */
138+
src = ptr;
139+
srclen = ptrlen;
140+
state = 1;
141+
break;
142+
case 1:
143+
/* continue src */
144+
srclen += ptrlen;
145+
break;
146+
case 2:
147+
/* start of trg */
148+
trg = ptr;
149+
trglen = ptrlen;
150+
state = 3;
151+
break;
152+
case 3:
153+
/* continue trg */
154+
trglen += ptrlen;
155+
break;
156+
default:
157+
/* bogus line format */
158+
state = -1;
159+
break;
160+
}
161+
}
113162

114-
srclen = strlen(src);
115-
trglen = strlen(trg);
163+
if (state >= 3)
164+
rootSuffixTree = placeChar(rootSuffixTree,
165+
(unsigned char *) src, srclen,
166+
trg, trglen);
116167

117-
rootSuffixTree = placeChar(rootSuffixTree,
118-
(unsigned char *) src, srclen,
119-
trg, trglen);
120-
skip = false;
121168
pfree(line);
122169
}
170+
skip = false;
123171
}
124172
PG_CATCH();
125173
{

0 commit comments

Comments
 (0)