Skip to content

Commit 0a3d1d1

Browse files
committed
Refactoring for MDEV-27042 and MDEV-27009
This patch prepares the code for upcoming changes:
- MDEV-27009 Add UCA-14.0.0 collations
- MDEV-27042 UCA: Resetting contractions to ignorable does not work well

1. Adding "const" qualifiers to the return type and parameters of these functions:
   - my_uca_contraction2_weight()
   - my_wmemcmp()
   - my_uca_contraction_weight()
   - my_uca_scanner_contraction_find()
   - my_uca_previous_context_find()
   - my_uca_context_weight_find()
2. Adding a helper function my_uca_true_contraction_eq().
3. Changing how scanner->wbeg is set during context weight handling. It was previously set inside these functions:
   - my_uca_scanner_contraction_find()
   - my_uca_previous_context_find()
   Now it is set inside scanner_next(), which makes the code more symmetric for context-free and context-dependent sequences. This makes the upcoming fix for MDEV-27042 simpler.
1 parent 86891b8 commit 0a3d1d1

File tree

3 files changed

+59
-39
lines changed

3 files changed

+59
-39
lines changed

include/m_ctype.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ typedef struct my_contraction_list_t
135135

136136
my_bool my_uca_can_be_contraction_head(const MY_CONTRACTIONS *c, my_wc_t wc);
137137
my_bool my_uca_can_be_contraction_tail(const MY_CONTRACTIONS *c, my_wc_t wc);
138-
uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
139-
my_wc_t wc1, my_wc_t wc2);
138+
const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
139+
my_wc_t wc1, my_wc_t wc2);
140140

141141

142142
/* Collation weights on a single level (e.g. primary, secondary, tertiary) */

strings/ctype-uca.c

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31358,7 +31358,7 @@ my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag)
3135831358
@retval ptr - contraction weight array
3135931359
*/
3136031360

31361-
uint16 *
31361+
const uint16 *
3136231362
my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2)
3136331363
{
3136431364
MY_CONTRACTION *c, *last;
@@ -31443,13 +31443,29 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
3144331443
@retval non-zero - strings are different
3144431444
*/
3144531445

31446-
static int
31447-
my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len)
31446+
static inline int
31447+
my_wmemcmp(const my_wc_t *a, const my_wc_t *b, size_t len)
3144831448
{
3144931449
return memcmp(a, b, len * sizeof(my_wc_t));
3145031450
}
3145131451

3145231452

31453+
/*
31454+
Test if the MY_CONTRACTION instance is equal to the wide
31455+
string with the given length.
31456+
Note, only true contractions are checked,
31457+
while previous context pairs always return FALSE.
31458+
*/
31459+
static inline my_bool
31460+
my_uca_true_contraction_eq(const MY_CONTRACTION *c,
31461+
const my_wc_t *wc, size_t len)
31462+
{
31463+
return (len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) &&
31464+
!c->with_context &&
31465+
!my_wmemcmp(c->ch, wc, len);
31466+
}
31467+
31468+
3145331469
/**
3145431470
Check if a string is a contraction,
3145531471
and return its weight array on success.
@@ -31463,17 +31479,15 @@ my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len)
3146331479
@retval ptr - contraction weight array
3146431480
*/
3146531481

31466-
static inline uint16 *
31482+
static inline const uint16 *
3146731483
my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
3146831484
{
3146931485
MY_CONTRACTION *c, *last;
3147031486
DBUG_ASSERT(len <= MY_UCA_MAX_CONTRACTION);
3147131487

3147231488
for (c= list->item, last= c + list->nitems; c < last; c++)
3147331489
{
31474-
if ((len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) &&
31475-
!c->with_context &&
31476-
!my_wmemcmp(c->ch, wc, len))
31490+
if (my_uca_true_contraction_eq(c, wc, len))
3147731491
return c->weight;
3147831492
}
3147931493
return NULL;
@@ -31495,12 +31509,15 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
3149531509
@retval ptr - contraction weight array
3149631510
*/
3149731511

31498-
static uint16 *
31499-
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
31512+
static const uint16 *
31513+
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc)
3150031514
{
3150131515
size_t clen= 1;
3150231516
int flag;
3150331517
const uchar *s, *beg[MY_UCA_MAX_CONTRACTION];
31518+
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
31519+
wc[0]= currwc;
31520+
3150431521
memset((void*) beg, 0, sizeof(beg));
3150531522

3150631523
/* Scan all contraction candidates */
@@ -31520,13 +31537,12 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
3152031537
/* Find among candidates the longest real contraction */
3152131538
for ( ; clen > 1; clen--)
3152231539
{
31523-
uint16 *cweight;
31540+
const uint16 *cweight;
3152431541
if (my_uca_can_be_contraction_tail(&scanner->level->contractions,
3152531542
wc[clen - 1]) &&
3152631543
(cweight= my_uca_contraction_weight(&scanner->level->contractions,
3152731544
wc, clen)))
3152831545
{
31529-
scanner->wbeg= cweight + 1;
3153031546
scanner->sbeg= beg[clen - 1];
3153131547
return cweight;
3153231548
}
@@ -31549,19 +31565,15 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
3154931565
@retval ptr - contraction weight array
3155031566
*/
3155131567

31552-
static uint16 *
31553-
my_uca_previous_context_find(my_uca_scanner *scanner,
31568+
static const uint16 *
31569+
my_uca_previous_context_find(const MY_CONTRACTIONS *list,
3155431570
my_wc_t wc0, my_wc_t wc1)
3155531571
{
31556-
const MY_CONTRACTIONS *list= &scanner->level->contractions;
3155731572
MY_CONTRACTION *c, *last;
3155831573
for (c= list->item, last= c + list->nitems; c < last; c++)
3155931574
{
3156031575
if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1])
31561-
{
31562-
scanner->wbeg= c->weight + 1;
3156331576
return c->weight;
31564-
}
3156531577
}
3156631578
return NULL;
3156731579
}
@@ -31584,10 +31596,11 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
3158431596
@retval NULL if could not find any contextual weights for wc[0]
3158531597
@retval non null pointer to a zero-terminated weight string otherwise
3158631598
*/
31587-
static inline uint16 *
31588-
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
31599+
static inline const uint16 *
31600+
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc)
3158931601
{
31590-
uint16 *cweight;
31602+
const uint16 *cweight;
31603+
my_wc_t prevwc;
3159131604
DBUG_ASSERT(scanner->level->contractions.nitems);
3159231605
/*
3159331606
If we have scanned a character which can have previous context,
@@ -31599,21 +31612,22 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
3159931612
context at the moment. CLDR does not have longer sequences.
3160031613
*/
3160131614
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
31602-
wc[0]) &&
31615+
currwc) &&
3160331616
scanner->wbeg != nochar && /* if not the very first character */
3160431617
my_uca_can_be_previous_context_head(&scanner->level->contractions,
31605-
(wc[1]= ((scanner->page << 8) +
31618+
(prevwc= ((scanner->page << 8) +
3160631619
scanner->code))) &&
31607-
(cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
31620+
(cweight= my_uca_previous_context_find(&scanner->level->contractions,
31621+
prevwc, currwc)))
3160831622
{
3160931623
scanner->page= scanner->code= 0; /* Clear for the next character */
3161031624
return cweight;
3161131625
}
3161231626
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
31613-
wc[0]))
31627+
currwc))
3161431628
{
3161531629
/* Check if w[0] starts a contraction */
31616-
if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
31630+
if ((cweight= my_uca_scanner_contraction_find(scanner, currwc)))
3161731631
return cweight;
3161831632
}
3161931633
return NULL;

strings/ctype-uca.ic

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,28 +52,31 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
5252
do
5353
{
5454
const uint16 *wpage;
55-
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
5655
int mblen;
56+
my_wc_t currwc;
5757

5858
/* Get next character */
5959
#if MY_UCA_ASCII_OPTIMIZE
6060
/* Get next ASCII character */
6161
if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
6262
{
63-
wc[0]= scanner->sbeg[0];
63+
currwc= scanner->sbeg[0];
6464
scanner->sbeg+= 1;
6565

6666
#if MY_UCA_COMPILE_CONTRACTIONS
67-
if (my_uca_needs_context_handling(scanner->level, wc[0]))
67+
if (my_uca_needs_context_handling(scanner->level, currwc))
6868
{
69-
uint16 *cweight= my_uca_context_weight_find(scanner, wc);
69+
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
7070
if (cweight)
71+
{
72+
scanner->wbeg= cweight + 1;
7173
return *cweight;
74+
}
7275
}
7376
#endif
7477

7578
scanner->page= 0;
76-
scanner->code= (int) wc[0];
79+
scanner->code= (int) currwc;
7780
scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
7881
if (scanner->wbeg[0])
7982
return *scanner->wbeg++;
@@ -82,8 +85,8 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
8285
else
8386
#endif
8487
/* Get next MB character */
85-
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
86-
scanner->send)) <= 0))
88+
if (((mblen= MY_MB_WC(scanner, &currwc, scanner->sbeg,
89+
scanner->send)) <= 0))
8790
{
8891
if (scanner->sbeg >= scanner->send)
8992
return -1; /* No more bytes, end of line reached */
@@ -105,25 +108,28 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
105108
}
106109

107110
scanner->sbeg+= mblen;
108-
if (wc[0] > scanner->level->maxchar)
111+
if (currwc > scanner->level->maxchar)
109112
{
110113
/* Return 0xFFFD as weight for all characters outside BMP */
111114
scanner->wbeg= nochar;
112115
return 0xFFFD;
113116
}
114117

115118
#if MY_UCA_COMPILE_CONTRACTIONS
116-
if (my_uca_needs_context_handling(scanner->level, wc[0]))
119+
if (my_uca_needs_context_handling(scanner->level, currwc))
117120
{
118-
uint16 *cweight= my_uca_context_weight_find(scanner, wc);
121+
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
119122
if (cweight)
123+
{
124+
scanner->wbeg= cweight + 1;
120125
return *cweight;
126+
}
121127
}
122128
#endif
123129

124130
/* Process single character */
125-
scanner->page= wc[0] >> 8;
126-
scanner->code= wc[0] & 0xFF;
131+
scanner->page= currwc >> 8;
132+
scanner->code= currwc & 0xFF;
127133

128134
/* If weight page for w[0] does not exist, then calculate algorithmically */
129135
if (!(wpage= scanner->level->weights[scanner->page]))

0 commit comments

Comments
 (0)