Skip to content

Commit

Permalink
Bug#57737 Character sets: search fails with like, contraction, index
Browse files Browse the repository at this point in the history
Problem: LIKE over an indexed column optimized away good results,
because my_like_range_utf32/utf16 returned wrong ranges for contractions.
Contraction related code was missing in my_like_range_utf32/utf16,
but did exist in my_like_range_ucs2/utf8.
It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess).

Fix:
The patch removes individual functions my_like_range_ucs2,
my_like_range_utf16, my_like_range_utf32 and introduces a single function
my_like_range_generic() instead. The new function handles contractions
correctly. It can handle any character set with cs->min_sort_char and
cs->max_sort_char represented in Unicode code points.

added:
  @ mysql-test/include/ctype_czech.inc
  @ mysql-test/include/ctype_like_ignorable.inc
  @ mysql-test/r/ctype_like_range.result
  @ mysql-test/t/ctype_like_range.test
  Adding tests


modified:

  @ include/m_ctype.h
  - Adding helper functions for contractions.
  - Prototypes: removing ucs2,utf16,utf32 functions, adding generic function.
  @ mysql-test/r/ctype_uca.result
  @ mysql-test/r/ctype_utf16_uca.result
  @ mysql-test/r/ctype_utf32_uca.result
  @ mysql-test/t/ctype_uca.test
  @ mysql-test/t/ctype_utf16_uca.test
  @ mysql-test/t/ctype_utf32_uca.test
  - Adding tests.

  @ strings/ctype-mb.c
  - Pad function did not put the last character.
  - Implementing my_like_range_generic() - an universal replacement
    for three separate functions
    my_like_range_ucs2(), my_like_range_utf16() and my_like_range_utf32(),
    with correct contraction handling.

  @ strings/ctype-ucs2.c
  - my_fill_mb2 did not put the high byte, as previously
    it was used to put only characters in ASCII range.
    Now it puts high byte as well
    (needed to pupulate cs->max_sort_char correctly).
  - Adding DBUG_ASSERT()
  - Removing character set specific functions:
    my_like_range_ucs2(), my_like_range_utf16() and my_like_range_utf32().
  - Using my_like_range_generic() instead of the old functions.

  @ strings/ctype-uca.c
  - Using generic function instead of the old character set specific ones.

  @ sql/item_create.cc
  @ sql/item_strfunc.cc
  @ sql/item_strfunc.h
  - Adding SQL functions LIKE_RANGE_MIN and LIKE_RANGE_MAX,
    available only in debug build to make sure like_range()
    works correctly for all character sets and collations.
  • Loading branch information
Alexander Barkov committed Nov 26, 2010
1 parent 7d27f9c commit 93386d7
Show file tree
Hide file tree
Showing 17 changed files with 3,001 additions and 344 deletions.
55 changes: 35 additions & 20 deletions include/m_ctype.h
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_unicode_ci;
#define MY_UTF8MB4 "utf8mb4"


/* Helper functions to handle contraction */
static inline my_bool
my_cs_have_contractions(CHARSET_INFO *cs)
{
return cs->contractions != NULL;
}

static inline my_bool
my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
{
return ((const char *)cs->contractions)[0x40*0x40 + (wc & 0xFF)];
}

static inline my_bool
my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
{
return ((const char *)cs->contractions)[0x40*0x40 + (wc & 0xFF)];
}

static inline uint16*
my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
{
return &cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40];
}


/* declarations for simple charsets */
extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
const uchar *, size_t);
Expand Down Expand Up @@ -430,40 +456,29 @@ ulonglong my_strntoull10rnd_ucs2(CHARSET_INFO *cs,

void my_fill_8bit(CHARSET_INFO *cs, char* to, size_t l, int fill);

/* For 8-bit character set */
my_bool my_like_range_simple(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str, char *max_str,
size_t *min_length, size_t *max_length);

/* For ASCII-based multi-byte character sets with mbminlen=1 */
my_bool my_like_range_mb(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str, char *max_str,
size_t *min_length, size_t *max_length);

my_bool my_like_range_ucs2(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str, char *max_str,
size_t *min_length, size_t *max_length);

my_bool my_like_range_utf16(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str, char *max_str,
size_t *min_length, size_t *max_length);

my_bool my_like_range_utf32(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str, char *max_str,
size_t *min_length, size_t *max_length);
/* For other character sets, with arbitrary mbminlen and mbmaxlen numbers */
my_bool my_like_range_generic(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str, char *max_str,
size_t *min_length, size_t *max_length);

int my_wildcmp_8bit(CHARSET_INFO *,
const char *str,const char *str_end,
Expand Down
12 changes: 12 additions & 0 deletions mysql-test/include/ctype_czech.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
SELECT @@collation_connection;
--echo #
--echo # Bug#57737 Character sets: search fails with like, contraction, index
--echo #
CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
INSERT INTO t1 VALUES ('c'),('ce'),(''),('ch');
SELECT * FROM t1 WHERE s1 LIKE 'c%';
ALTER TABLE t1 ADD KEY s1 (s1);
SELECT * FROM t1 WHERE s1 LIKE 'c%';
ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
SELECT * FROM t1 WHERE s1 LIKE 'ch';
DROP TABLE t1;
11 changes: 11 additions & 0 deletions mysql-test/include/ctype_like_ignorable.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT @@collation_connection;
--echo #
--echo # Bug#57737 Character sets: search fails with like, contraction, index
--echo # Part#2 - ignorable characters
--echo #
CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
INSERT INTO t1 VALUES ('a\0\0\0\0\0\t'),('a'),('b'),('c'),('d'),('e');
SELECT HEX(s1) FROM t1 WHERE s1 LIKE 'a%';
ALTER TABLE t1 ADD KEY s1 (s1);
SELECT HEX(s1) FROM t1 WHERE s1 LIKE 'a%';
DROP TABLE t1;
Loading

0 comments on commit 93386d7

Please sign in to comment.