Skip to content

Commit

Permalink
MDEV-25904 New collation functions to compare InnoDB style trimmed NO…
Browse files Browse the repository at this point in the history
… PAD strings
  • Loading branch information
abarkov committed Jan 21, 2022
1 parent db57417 commit b915f79
Show file tree
Hide file tree
Showing 25 changed files with 1,150 additions and 144 deletions.
54 changes: 54 additions & 0 deletions include/m_ctype.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,60 @@ struct my_collation_handler_st
const uchar *, size_t, const uchar *, size_t, my_bool);
int (*strnncollsp)(CHARSET_INFO *,
const uchar *, size_t, const uchar *, size_t);
/*
strnncollsp_nchars() - similar to strnncollsp() but assumes that both
strings were originally CHAR(N) values with the
same N, then were optionally space-padded,
or optionally space-trimmed.
In other words, this function compares the two strings as if
both values were inserted into a CHAR(N) column
and the two column values were then compared.
It compares the same number of characters from the two strings.
This is especially important for NOPAD collations.
If the CHAR_LENGTH values of the two strings differ,
the shorter string is virtually padded with trailing spaces
up to the CHAR_LENGTH of the longer string, to guarantee that the
same number of characters is compared.
This is important if the two CHAR(N) strings are space-trimmed
(e.g. like in InnoDB compact format for CHAR).
The function compares at most "nchars" characters.
This can be useful to compare CHAR(N) space-padded strings
(when the exact N is known) without having to truncate them before
the comparison.
For example, Field_string stores a "CHAR(3) CHARACTER SET utf8mb4" value
of "aaa" as 12 bytes in a record buffer:
- 3 bytes of the actual data, followed by
- 9 bytes of spaces (just fillers, not real data)
The caller can pass nchars=3 to compare CHAR(3) record values.
In such case, the comparator won't go inside the 9 bytes of the fillers.
If N is not known, the caller can pass max(len1,len2) as the "nchars" value
(i.e. the maximum of the OCTET_LENGTH of the two strings).
Notes on complex collations.
This function counts contraction parts as individual characters.
For example, the Czech letter 'ch' (in Czech collations)
is ordinarily counted by the "nchars" limit as TWO characters
(although it is only one letter).
This corresponds to what CHAR(N) does in INSERT.
If the "nchars" limit tears apart a contraction, only the part fitting
into "nchars" characters is used. For example, in case of a Czech collation,
the string "ach" with nchars=2 is compared as 'ac': the contraction
'ch' is torn apart and the letter 'c' acts as an individual character.
This gives the same comparison result as inserting
'ach' into a CHAR(2) column and then comparing the column value.
*/
int (*strnncollsp_nchars)(CHARSET_INFO *,
const uchar *str1, size_t len1,
const uchar *str2, size_t len2,
size_t nchars);
size_t (*strnxfrm)(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
Expand Down
47 changes: 14 additions & 33 deletions sql/field.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7433,23 +7433,10 @@ Field_string::compatible_field_size(uint field_metadata,

int Field_string::cmp(const uchar *a_ptr, const uchar *b_ptr)
{
size_t a_len, b_len;

if (field_charset->mbmaxlen != 1)
{
size_t char_len= field_length/field_charset->mbmaxlen;
a_len= my_charpos(field_charset, a_ptr, a_ptr + field_length, char_len);
b_len= my_charpos(field_charset, b_ptr, b_ptr + field_length, char_len);
}
else
a_len= b_len= field_length;
/*
We have to remove end space to be able to compare multi-byte-characters
like in latin_de 'ae' and 0xe4
*/
return field_charset->coll->strnncollsp(field_charset,
a_ptr, a_len,
b_ptr, b_len);
return field_charset->coll->strnncollsp_nchars(field_charset,
a_ptr, field_length,
b_ptr, field_length,
Field_string::char_length());
}


Expand Down Expand Up @@ -7848,19 +7835,6 @@ int Field_varstring::cmp(const uchar *a_ptr, const uchar *b_ptr)
}


static int cmp_str_prefix(const uchar *ua, size_t alen, const uchar *ub,
size_t blen, size_t prefix, CHARSET_INFO *cs)
{
const char *a= (char*)ua, *b= (char*)ub;
MY_STRCOPY_STATUS status;
prefix/= cs->mbmaxlen;
alen= cs->cset->well_formed_char_length(cs, a, a + alen, prefix, &status);
blen= cs->cset->well_formed_char_length(cs, b, b + blen, prefix, &status);
return cs->coll->strnncollsp(cs, ua, alen, ub, blen);
}



int Field_varstring::cmp_prefix(const uchar *a_ptr, const uchar *b_ptr,
size_t prefix_len)
{
Expand All @@ -7880,8 +7854,12 @@ int Field_varstring::cmp_prefix(const uchar *a_ptr, const uchar *b_ptr,
a_length= uint2korr(a_ptr);
b_length= uint2korr(b_ptr);
}
return cmp_str_prefix(a_ptr+length_bytes, a_length, b_ptr+length_bytes,
b_length, prefix_len, field_charset);
return field_charset->coll->strnncollsp_nchars(field_charset,
a_ptr + length_bytes,
a_length,
b_ptr + length_bytes,
b_length,
prefix_len / field_charset->mbmaxlen);
}


Expand Down Expand Up @@ -8659,7 +8637,10 @@ int Field_blob::cmp_prefix(const uchar *a_ptr, const uchar *b_ptr,
memcpy(&blob1, a_ptr+packlength, sizeof(char*));
memcpy(&blob2, b_ptr+packlength, sizeof(char*));
size_t a_len= get_length(a_ptr), b_len= get_length(b_ptr);
return cmp_str_prefix(blob1, a_len, blob2, b_len, prefix_len, field_charset);
return field_charset->coll->strnncollsp_nchars(field_charset,
blob1, a_len,
blob2, b_len,
prefix_len / field_charset->mbmaxlen);
}


Expand Down
4 changes: 4 additions & 0 deletions strings/ctype-big5.c
Original file line number Diff line number Diff line change
Expand Up @@ -6711,6 +6711,7 @@ static MY_COLLATION_HANDLER my_collation_handler_big5_chinese_ci=
NULL, /* init */
my_strnncoll_big5_chinese_ci,
my_strnncollsp_big5_chinese_ci,
my_strnncollsp_nchars_big5_chinese_ci,
my_strnxfrm_big5_chinese_ci,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -6727,6 +6728,7 @@ static MY_COLLATION_HANDLER my_collation_handler_big5_bin=
NULL, /* init */
my_strnncoll_big5_bin,
my_strnncollsp_big5_bin,
my_strnncollsp_nchars_big5_bin,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -6743,6 +6745,7 @@ static MY_COLLATION_HANDLER my_collation_handler_big5_chinese_nopad_ci=
NULL, /* init */
my_strnncoll_big5_chinese_ci,
my_strnncollsp_big5_chinese_nopad_ci,
my_strnncollsp_nchars_big5_chinese_nopad_ci,
my_strnxfrm_big5_chinese_nopad_ci,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -6759,6 +6762,7 @@ static MY_COLLATION_HANDLER my_collation_handler_big5_nopad_bin=
NULL, /* init */
my_strnncoll_big5_bin,
my_strnncollsp_big5_nopad_bin,
my_strnncollsp_nchars_big5_nopad_bin,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand Down
25 changes: 25 additions & 0 deletions strings/ctype-bin.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ static int my_strnncollsp_binary(CHARSET_INFO * cs __attribute__((unused)),
}


/*
  strnncollsp_nchars() implementation for the "binary" collation handler.

  "nchars" is applied directly as an upper limit on both lengths:
  neither string contributes more than "nchars" bytes.
  The (possibly shortened) strings are then passed to
  my_strnncoll_binary() with its last argument set to 0.

  @return the result of my_strnncoll_binary()
*/
static int my_strnncollsp_nchars_binary(CHARSET_INFO * cs __attribute__((unused)),
                                        const uchar *s, size_t slen,
                                        const uchar *t, size_t tlen,
                                        size_t nchars)
{
  const size_t s_limit= slen > nchars ? nchars : slen;
  const size_t t_limit= tlen > nchars ? nchars : tlen;

  return my_strnncoll_binary(cs, s, s_limit, t, t_limit, 0);
}


static int my_strnncoll_8bit_bin(CHARSET_INFO * cs __attribute__((unused)),
const uchar *s, size_t slen,
const uchar *t, size_t tlen,
Expand Down Expand Up @@ -199,6 +210,17 @@ static int my_strnncollsp_8bit_bin(CHARSET_INFO * cs __attribute__((unused)),
}


/*
  strnncollsp_nchars() implementation shared by the 8bit binary
  collation handlers.

  Each length is cut down to "nchars" when it exceeds that limit,
  then the comparison itself is delegated to my_strnncollsp_8bit_bin().

  @return the result of my_strnncollsp_8bit_bin()
*/
static int my_strnncollsp_nchars_8bit_bin(CHARSET_INFO * cs,
                                          const uchar *a, size_t a_length,
                                          const uchar *b, size_t b_length,
                                          size_t nchars)
{
  if (a_length > nchars)
    a_length= nchars;
  if (b_length > nchars)
    b_length= nchars;

  return my_strnncollsp_8bit_bin(cs, a, a_length, b, b_length);
}


static int my_strnncollsp_8bit_nopad_bin(CHARSET_INFO * cs
__attribute__((unused)),
const uchar *a, size_t a_length,
Expand Down Expand Up @@ -487,6 +509,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
my_coll_init_8bit_bin,
my_strnncoll_8bit_bin,
my_strnncollsp_8bit_bin,
my_strnncollsp_nchars_8bit_bin,
my_strnxfrm_8bit_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
Expand All @@ -503,6 +526,7 @@ MY_COLLATION_HANDLER my_collation_8bit_nopad_bin_handler =
my_coll_init_8bit_bin,
my_strnncoll_8bit_bin,
my_strnncollsp_8bit_nopad_bin,
my_strnncollsp_nchars_8bit_bin,
my_strnxfrm_8bit_nopad_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
Expand All @@ -519,6 +543,7 @@ static MY_COLLATION_HANDLER my_collation_binary_handler =
NULL, /* init */
my_strnncoll_binary,
my_strnncollsp_binary,
my_strnncollsp_nchars_binary,
my_strnxfrm_8bit_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
Expand Down
4 changes: 4 additions & 0 deletions strings/ctype-cp932.c
Original file line number Diff line number Diff line change
Expand Up @@ -34667,6 +34667,7 @@ static MY_COLLATION_HANDLER my_collation_handler_cp932_japanese_ci=
NULL, /* init */
my_strnncoll_cp932_japanese_ci,
my_strnncollsp_cp932_japanese_ci,
my_strnncollsp_nchars_cp932_japanese_ci,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -34683,6 +34684,7 @@ static MY_COLLATION_HANDLER my_collation_handler_cp932_bin=
NULL, /* init */
my_strnncoll_cp932_bin,
my_strnncollsp_cp932_bin,
my_strnncollsp_nchars_cp932_bin,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -34699,6 +34701,7 @@ static MY_COLLATION_HANDLER my_collation_handler_cp932_japanese_nopad_ci=
NULL, /* init */
my_strnncoll_cp932_japanese_ci,
my_strnncollsp_cp932_japanese_nopad_ci,
my_strnncollsp_nchars_cp932_japanese_nopad_ci,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -34715,6 +34718,7 @@ static MY_COLLATION_HANDLER my_collation_handler_cp932_nopad_bin=
NULL, /* init */
my_strnncoll_cp932_bin,
my_strnncollsp_cp932_nopad_bin,
my_strnncollsp_nchars_cp932_nopad_bin,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand Down
1 change: 1 addition & 0 deletions strings/ctype-czech.c
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,7 @@ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler =
NULL, /* init */
my_strnncoll_czech,
my_strnncollsp_czech,
my_strnncollsp_nchars_generic_8bit,
my_strnxfrm_czech,
my_strnxfrmlen_czech,
my_like_range_czech,
Expand Down
4 changes: 4 additions & 0 deletions strings/ctype-euc_kr.c
Original file line number Diff line number Diff line change
Expand Up @@ -9957,6 +9957,7 @@ static MY_COLLATION_HANDLER my_collation_handler_euckr_korean_ci=
NULL, /* init */
my_strnncoll_euckr_korean_ci,
my_strnncollsp_euckr_korean_ci,
my_strnncollsp_nchars_euckr_korean_ci,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -9973,6 +9974,7 @@ static MY_COLLATION_HANDLER my_collation_handler_euckr_bin=
NULL, /* init */
my_strnncoll_euckr_bin,
my_strnncollsp_euckr_bin,
my_strnncollsp_nchars_euckr_bin,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -9989,6 +9991,7 @@ static MY_COLLATION_HANDLER my_collation_handler_euckr_korean_nopad_ci=
NULL, /* init */
my_strnncoll_euckr_korean_ci,
my_strnncollsp_euckr_korean_nopad_ci,
my_strnncollsp_nchars_euckr_korean_nopad_ci,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -10005,6 +10008,7 @@ static MY_COLLATION_HANDLER my_collation_handler_euckr_nopad_bin=
NULL, /* init */
my_strnncoll_euckr_bin,
my_strnncollsp_euckr_nopad_bin,
my_strnncollsp_nchars_euckr_nopad_bin,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand Down
4 changes: 4 additions & 0 deletions strings/ctype-eucjpms.c
Original file line number Diff line number Diff line change
Expand Up @@ -67495,6 +67495,7 @@ static MY_COLLATION_HANDLER my_collation_eucjpms_japanese_ci_handler =
NULL, /* init */
my_strnncoll_eucjpms_japanese_ci,
my_strnncollsp_eucjpms_japanese_ci,
my_strnncollsp_nchars_eucjpms_japanese_ci,
my_strnxfrm_mb, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_mb, /* like_range */
Expand All @@ -67511,6 +67512,7 @@ static MY_COLLATION_HANDLER my_collation_eucjpms_bin_handler =
NULL, /* init */
my_strnncoll_eucjpms_bin,
my_strnncollsp_eucjpms_bin,
my_strnncollsp_nchars_eucjpms_bin,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -67527,6 +67529,7 @@ static MY_COLLATION_HANDLER my_collation_eucjpms_japanese_nopad_ci_handler =
NULL, /* init */
my_strnncoll_eucjpms_japanese_ci,
my_strnncollsp_eucjpms_japanese_nopad_ci,
my_strnncollsp_nchars_eucjpms_japanese_nopad_ci,
my_strnxfrm_mb_nopad, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_mb, /* like_range */
Expand All @@ -67543,6 +67546,7 @@ static MY_COLLATION_HANDLER my_collation_eucjpms_nopad_bin_handler =
NULL, /* init */
my_strnncoll_eucjpms_bin,
my_strnncollsp_eucjpms_nopad_bin,
my_strnncollsp_nchars_eucjpms_nopad_bin,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand Down
4 changes: 4 additions & 0 deletions strings/ctype-gb2312.c
Original file line number Diff line number Diff line change
Expand Up @@ -6362,6 +6362,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gb2312_chinese_ci=
NULL, /* init */
my_strnncoll_gb2312_chinese_ci,
my_strnncollsp_gb2312_chinese_ci,
my_strnncollsp_nchars_gb2312_chinese_ci,
my_strnxfrm_mb, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_mb, /* like_range */
Expand All @@ -6378,6 +6379,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gb2312_bin=
NULL, /* init */
my_strnncoll_gb2312_bin,
my_strnncollsp_gb2312_bin,
my_strnncollsp_nchars_gb2312_bin,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -6394,6 +6396,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gb2312_chinese_nopad_ci=
NULL, /* init */
my_strnncoll_gb2312_chinese_ci,
my_strnncollsp_gb2312_chinese_nopad_ci,
my_strnncollsp_nchars_gb2312_chinese_nopad_ci,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -6410,6 +6413,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gb2312_nopad_bin=
NULL, /* init */
my_strnncoll_gb2312_bin,
my_strnncollsp_gb2312_nopad_bin,
my_strnncollsp_nchars_gb2312_nopad_bin,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand Down
4 changes: 4 additions & 0 deletions strings/ctype-gbk.c
Original file line number Diff line number Diff line change
Expand Up @@ -10645,6 +10645,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gbk_chinese_ci=
NULL, /* init */
my_strnncoll_gbk_chinese_ci,
my_strnncollsp_gbk_chinese_ci,
my_strnncollsp_nchars_gbk_chinese_ci,
my_strnxfrm_gbk_chinese_ci,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -10661,6 +10662,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gbk_bin=
NULL, /* init */
my_strnncoll_gbk_bin,
my_strnncollsp_gbk_bin,
my_strnncollsp_nchars_gbk_bin,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -10677,6 +10679,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gbk_chinese_nopad_ci=
NULL, /* init */
my_strnncoll_gbk_chinese_ci,
my_strnncollsp_gbk_chinese_nopad_ci,
my_strnncollsp_nchars_gbk_chinese_nopad_ci,
my_strnxfrm_gbk_chinese_nopad_ci,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand All @@ -10693,6 +10696,7 @@ static MY_COLLATION_HANDLER my_collation_handler_gbk_nopad_bin=
NULL, /* init */
my_strnncoll_gbk_bin,
my_strnncollsp_gbk_nopad_bin,
my_strnncollsp_nchars_gbk_nopad_bin,
my_strnxfrm_mb_nopad,
my_strnxfrmlen_simple,
my_like_range_mb,
Expand Down

0 comments on commit b915f79

Please sign in to comment.