Skip to content

Commit

Permalink
Extra tests for MDEV-30716 Wrong casefolding in xxx_unicode_520_ci fo…
Browse files Browse the repository at this point in the history
…r U+0700..U+07FF

New tests display additional information about characters from the BMP range:

- A summary with a COUNT(*) for all distinct combinations of properties
  telling how the "=" and the "LIKE" predicates compare characters to their
  LOWER() and UPPER() variants.

- A detailed list of tricky characters
  for which the "=" and the "LIKE" predicates compare
  LOWER(c)/UPPER(c) variants as not equal to just "c".

Tricky characters include:
 - Turkish letters: ı - small dotless letter i
 - Croatian letters: precombined contractions for Dž, Dz, Lj, Nj
 - Units of measurement: Ω,K,Å (Ohm, Kelvin, Angstrom)
   These look very similar to the Greek letter Omega,
   the Latin letter K, and the Swedish/Finnish letter A with a ring above.
  • Loading branch information
abarkov committed Mar 7, 2024
1 parent 929c2e0 commit 9e7afa7
Show file tree
Hide file tree
Showing 18 changed files with 746 additions and 8 deletions.
47 changes: 47 additions & 0 deletions mysql-test/include/ctype_unicode_casefold_bmp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,58 @@ FROM

# Record which collation the column v_bmp.c actually has, so the .result file
# documents the collation that all following comparisons are evaluated under.
SELECT COLLATION(c) FROM v_bmp LIMIT 1;

# Summary query: partition all rows of v_bmp by six boolean properties and
# count the rows in each partition.
#   Bc=BLc / Bc=BUc   - c is byte-wise identical to LOWER(c) / UPPER(c)
#   c=L(c) / c=U(c)   - "=" under the column collation treats c as equal to
#                       LOWER(c) / UPPER(c)
#   c~~L(c) / c~~U(c) - LIKE treats c as matching LOWER(c) / UPPER(c)
# The "example" column shows the first 20 characters of the comma-separated
# list of group members (in codepoint order). It is left empty for the group
# whose characters are byte-wise unchanged by both LOWER() and UPPER().
#
# NOTE(review): the statement has no ORDER BY, so the row order recorded in the
# .result files relies on the order in which GROUP BY returns the groups.
# The statement text itself is echoed into the .result files, so changing it
# requires re-recording every test that sources this include file.
--echo #
--echo # BMP character summary
--echo #

SELECT
BINARY(c)=BINARY(LOWER(c)) AS `Bc=BLc`,
BINARY(c)=BINARY(UPPER(c)) AS `Bc=BUc`,
c=LOWER(c) AS `c=L(c)`,
c=UPPER(c) AS `c=U(c)`,
c LIKE LOWER(c) AS `c~~L(c)`,
c LIKE UPPER(c) AS `c~~U(c)`,
COUNT(*),
IF(BINARY(c)=BINARY(LOWER(c)) AND BINARY(c)=BINARY(UPPER(c)),'',
LEFT(GROUP_CONCAT(c ORDER BY codepoint), 20)) AS example
FROM v_bmp
GROUP BY 1, 2, 3, 4, 5, 6;


# Full mapping list: for every row of v_bmp whose character is byte-wise
# different from its LOWER() or its UPPER() variant, print the code point
# (codepoint_hex4) together with the hex representation of LOWER(c) and
# UPPER(c) after conversion to ucs2.
#
# NOTE(review): no ORDER BY here either; the recorded output appears to follow
# the row order produced by the v_bmp view - confirm against the view
# definition before relying on it.
--echo #
--echo # BMP characters with upper/lower mapping
--echo #

SELECT
codepoint_hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)),
HEX(CAST(UPPER(c) AS CHAR CHARACTER SET ucs2))
FROM v_bmp
WHERE BINARY(c)<>BINARY(LOWER(c)) OR BINARY(c)<>BINARY(UPPER(c));

# Detailed list of "tricky" characters. A row is reported when it is NOT the
# case that all of the following hold:
#   - c is byte-wise identical to at least one of LOWER(c) / UPPER(c), and
#   - "=" treats c as equal to both LOWER(c) and UPPER(c), and
#   - LIKE treats c as matching both LOWER(c) and UPPER(c).
# In other words, the output contains characters that differ byte-wise from
# both of their case variants, plus characters for which "=" or LIKE considers
# a case variant different from the character itself.
# For each such row the query prints the code point, the ucs2 hex of both case
# variants, the four comparison results, and the characters themselves.
#
# NOTE(review): the c, L(c) and U(c) columns are printed as text; the .test
# files in this commit set @@character_set_results=utf8mb3 before sourcing
# this file, presumably so that these columns are readable in the .result.
--echo #
--echo # BMP characters with a non-trivial upper/lower mapping
--echo #

SELECT
codepoint_hex4 as hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_l,
HEX(CAST(UPPER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_u,
c=LOWER(c) AS `c=L`,
c=UPPER(c) AS `c=U`,
c LIKE LOWER(c) AS `c~~L`,
c LIKE UPPER(c) AS `c~~U`,
c,
LOWER(c) AS `L(c)`,
UPPER(c) AS `U(c)`
FROM v_bmp
WHERE NOT (
(BINARY(c)=BINARY(LOWER(c)) OR BINARY(c)=BINARY(UPPER(c))) AND
c = LOWER(c) AND
c = UPPER(c) AND
c LIKE UPPER(c) AND
c LIKE LOWER(c)
);


# Remove the helper view so the test leaves no objects behind.
DROP VIEW v_bmp;
56 changes: 55 additions & 1 deletion mysql-test/main/ctype_ucs2_general_ci_casefold.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
#
SET collation_connection=ucs2_general_ci;
SET collation_connection=ucs2_general_ci, @@character_set_results=utf8mb3;
EXECUTE IMMEDIATE SFORMAT('
CREATE VIEW v_bmp AS
SELECT
Expand All @@ -16,6 +16,30 @@ FROM
SELECT COLLATION(c) FROM v_bmp LIMIT 1;
COLLATION(c)
ucs2_general_ci
#
# BMP character summary
#
SELECT
BINARY(c)=BINARY(LOWER(c)) AS `Bc=BLc`,
BINARY(c)=BINARY(UPPER(c)) AS `Bc=BUc`,
c=LOWER(c) AS `c=L(c)`,
c=UPPER(c) AS `c=U(c)`,
c LIKE LOWER(c) AS `c~~L(c)`,
c LIKE UPPER(c) AS `c~~U(c)`,
COUNT(*),
IF(BINARY(c)=BINARY(LOWER(c)) AND BINARY(c)=BINARY(UPPER(c)),'',
LEFT(GROUP_CONCAT(c ORDER BY codepoint), 20)) AS example
FROM v_bmp
GROUP BY 1, 2, 3, 4, 5, 6;
Bc=BLc Bc=BUc c=L(c) c=U(c) c~~L(c) c~~U(c) COUNT(*) example
0 0 1 1 1 1 4 Dž,Lj,Nj,Dz
0 1 0 1 0 1 3 Ω,K,Å
0 1 1 1 1 1 689 A,B,C,D,E,F,G,H,I,J,
1 0 1 1 1 1 702 a,b,c,d,e,f,g,h,i,j,
1 1 1 1 1 1 64138
#
# BMP characters with upper/lower mapping
#
SELECT
codepoint_hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)),
Expand Down Expand Up @@ -1421,6 +1445,36 @@ FF57 FF57 FF37
FF58 FF58 FF38
FF59 FF59 FF39
FF5A FF5A FF3A
#
# BMP characters with a non-trivial upper/lower mapping
#
SELECT
codepoint_hex4 as hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_l,
HEX(CAST(UPPER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_u,
c=LOWER(c) AS `c=L`,
c=UPPER(c) AS `c=U`,
c LIKE LOWER(c) AS `c~~L`,
c LIKE UPPER(c) AS `c~~U`,
c,
LOWER(c) AS `L(c)`,
UPPER(c) AS `U(c)`
FROM v_bmp
WHERE NOT (
(BINARY(c)=BINARY(LOWER(c)) OR BINARY(c)=BINARY(UPPER(c))) AND
c = LOWER(c) AND
c = UPPER(c) AND
c LIKE UPPER(c) AND
c LIKE LOWER(c)
);
hex4 hex4_l hex4_u c=L c=U c~~L c~~U c L(c) U(c)
01C5 01C6 01C4 1 1 1 1 Dž dž DŽ
01C8 01C9 01C7 1 1 1 1 Lj lj LJ
01CB 01CC 01CA 1 1 1 1 Nj nj NJ
01F2 01F3 01F1 1 1 1 1 Dz dz DZ
2126 03C9 2126 0 1 0 1 Ω ω Ω
212A 006B 212A 0 1 0 1 K k K
212B 00E5 212B 0 1 0 1 Å å Å
DROP VIEW v_bmp;
#
# End of 10.7 tests
Expand Down
2 changes: 1 addition & 1 deletion mysql-test/main/ctype_ucs2_general_ci_casefold.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
--echo # MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
--echo #

SET collation_connection=ucs2_general_ci;
SET collation_connection=ucs2_general_ci, @@character_set_results=utf8mb3;
--source include/ctype_unicode_casefold_bmp.inc

--echo #
Expand Down
56 changes: 55 additions & 1 deletion mysql-test/main/ctype_ucs2_general_mysql500_ci_casefold.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
#
SET @@collation_connection=ucs2_general_mysql500_ci;
SET @@collation_connection=ucs2_general_mysql500_ci, @@character_set_results=utf8mb3;
EXECUTE IMMEDIATE SFORMAT('
CREATE VIEW v_bmp AS
SELECT
Expand All @@ -16,6 +16,30 @@ FROM
SELECT COLLATION(c) FROM v_bmp LIMIT 1;
COLLATION(c)
ucs2_general_mysql500_ci
#
# BMP character summary
#
SELECT
BINARY(c)=BINARY(LOWER(c)) AS `Bc=BLc`,
BINARY(c)=BINARY(UPPER(c)) AS `Bc=BUc`,
c=LOWER(c) AS `c=L(c)`,
c=UPPER(c) AS `c=U(c)`,
c LIKE LOWER(c) AS `c~~L(c)`,
c LIKE UPPER(c) AS `c~~U(c)`,
COUNT(*),
IF(BINARY(c)=BINARY(LOWER(c)) AND BINARY(c)=BINARY(UPPER(c)),'',
LEFT(GROUP_CONCAT(c ORDER BY codepoint), 20)) AS example
FROM v_bmp
GROUP BY 1, 2, 3, 4, 5, 6;
Bc=BLc Bc=BUc c=L(c) c=U(c) c~~L(c) c~~U(c) COUNT(*) example
0 0 1 1 1 1 4 Dž,Lj,Nj,Dz
0 1 0 1 0 1 3 Ω,K,Å
0 1 1 1 1 1 689 A,B,C,D,E,F,G,H,I,J,
1 0 1 1 1 1 702 a,b,c,d,e,f,g,h,i,j,
1 1 1 1 1 1 64138
#
# BMP characters with upper/lower mapping
#
SELECT
codepoint_hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)),
Expand Down Expand Up @@ -1421,6 +1445,36 @@ FF57 FF57 FF37
FF58 FF58 FF38
FF59 FF59 FF39
FF5A FF5A FF3A
#
# BMP characters with a non-trivial upper/lower mapping
#
SELECT
codepoint_hex4 as hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_l,
HEX(CAST(UPPER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_u,
c=LOWER(c) AS `c=L`,
c=UPPER(c) AS `c=U`,
c LIKE LOWER(c) AS `c~~L`,
c LIKE UPPER(c) AS `c~~U`,
c,
LOWER(c) AS `L(c)`,
UPPER(c) AS `U(c)`
FROM v_bmp
WHERE NOT (
(BINARY(c)=BINARY(LOWER(c)) OR BINARY(c)=BINARY(UPPER(c))) AND
c = LOWER(c) AND
c = UPPER(c) AND
c LIKE UPPER(c) AND
c LIKE LOWER(c)
);
hex4 hex4_l hex4_u c=L c=U c~~L c~~U c L(c) U(c)
01C5 01C6 01C4 1 1 1 1 Dž dž DŽ
01C8 01C9 01C7 1 1 1 1 Lj lj LJ
01CB 01CC 01CA 1 1 1 1 Nj nj NJ
01F2 01F3 01F1 1 1 1 1 Dz dz DZ
2126 03C9 2126 0 1 0 1 Ω ω Ω
212A 006B 212A 0 1 0 1 K k K
212B 00E5 212B 0 1 0 1 Å å Å
DROP VIEW v_bmp;
#
# End of 10.7 tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
--echo # MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
--echo #

SET @@collation_connection=ucs2_general_mysql500_ci;
SET @@collation_connection=ucs2_general_mysql500_ci, @@character_set_results=utf8mb3;
--source include/ctype_unicode_casefold_bmp.inc


Expand Down
54 changes: 53 additions & 1 deletion mysql-test/main/ctype_ucs2_turkish_ci_casefold.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
#
SET @@collation_connection=ucs2_turkish_ci;
SET @@collation_connection=ucs2_turkish_ci, @@character_set_results=utf8mb3;
EXECUTE IMMEDIATE SFORMAT('
CREATE VIEW v_bmp AS
SELECT
Expand All @@ -16,6 +16,30 @@ FROM
SELECT COLLATION(c) FROM v_bmp LIMIT 1;
COLLATION(c)
ucs2_turkish_ci
#
# BMP character summary
#
SELECT
BINARY(c)=BINARY(LOWER(c)) AS `Bc=BLc`,
BINARY(c)=BINARY(UPPER(c)) AS `Bc=BUc`,
c=LOWER(c) AS `c=L(c)`,
c=UPPER(c) AS `c=U(c)`,
c LIKE LOWER(c) AS `c~~L(c)`,
c LIKE UPPER(c) AS `c~~U(c)`,
COUNT(*),
IF(BINARY(c)=BINARY(LOWER(c)) AND BINARY(c)=BINARY(UPPER(c)),'',
LEFT(GROUP_CONCAT(c ORDER BY codepoint), 20)) AS example
FROM v_bmp
GROUP BY 1, 2, 3, 4, 5, 6;
Bc=BLc Bc=BUc c=L(c) c=U(c) c~~L(c) c~~U(c) COUNT(*) example
0 0 1 1 1 1 4 Dž,Lj,Nj,Dz
0 1 1 1 1 1 692 A,B,C,D,E,F,G,H,I,J,
1 0 1 0 1 0 1 ͅ
1 0 1 1 1 1 701 a,b,c,d,e,f,g,h,i,j,
1 1 1 1 1 1 64138
#
# BMP characters with upper/lower mapping
#
SELECT
codepoint_hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)),
Expand Down Expand Up @@ -1421,6 +1445,34 @@ FF57 FF57 FF37
FF58 FF58 FF38
FF59 FF59 FF39
FF5A FF5A FF3A
#
# BMP characters with a non-trivial upper/lower mapping
#
SELECT
codepoint_hex4 as hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_l,
HEX(CAST(UPPER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_u,
c=LOWER(c) AS `c=L`,
c=UPPER(c) AS `c=U`,
c LIKE LOWER(c) AS `c~~L`,
c LIKE UPPER(c) AS `c~~U`,
c,
LOWER(c) AS `L(c)`,
UPPER(c) AS `U(c)`
FROM v_bmp
WHERE NOT (
(BINARY(c)=BINARY(LOWER(c)) OR BINARY(c)=BINARY(UPPER(c))) AND
c = LOWER(c) AND
c = UPPER(c) AND
c LIKE UPPER(c) AND
c LIKE LOWER(c)
);
hex4 hex4_l hex4_u c=L c=U c~~L c~~U c L(c) U(c)
01C5 01C6 01C4 1 1 1 1 Dž dž DŽ
01C8 01C9 01C7 1 1 1 1 Lj lj LJ
01CB 01CC 01CA 1 1 1 1 Nj nj NJ
01F2 01F3 01F1 1 1 1 1 Dz dz DZ
0345 0345 0399 1 0 1 0 ͅ ͅ Ι
DROP VIEW v_bmp;
#
# End of 10.7 tests
Expand Down
2 changes: 1 addition & 1 deletion mysql-test/main/ctype_ucs2_turkish_ci_casefold.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
--echo # MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
--echo #

SET @@collation_connection=ucs2_turkish_ci;
SET @@collation_connection=ucs2_turkish_ci, @@character_set_results=utf8mb3;
--source include/ctype_unicode_casefold_bmp.inc

--echo #
Expand Down
55 changes: 54 additions & 1 deletion mysql-test/main/ctype_ucs2_unicode_520_ci_casefold.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
#
SET @@collation_connection=ucs2_unicode_520_ci;
SET @@collation_connection=ucs2_unicode_520_ci, @@character_set_results=utf8mb3;
EXECUTE IMMEDIATE SFORMAT('
CREATE VIEW v_bmp AS
SELECT
Expand All @@ -16,6 +16,30 @@ FROM
SELECT COLLATION(c) FROM v_bmp LIMIT 1;
COLLATION(c)
ucs2_unicode_520_ci
#
# BMP character summary
#
SELECT
BINARY(c)=BINARY(LOWER(c)) AS `Bc=BLc`,
BINARY(c)=BINARY(UPPER(c)) AS `Bc=BUc`,
c=LOWER(c) AS `c=L(c)`,
c=UPPER(c) AS `c=U(c)`,
c LIKE LOWER(c) AS `c~~L(c)`,
c LIKE UPPER(c) AS `c~~U(c)`,
COUNT(*),
IF(BINARY(c)=BINARY(LOWER(c)) AND BINARY(c)=BINARY(UPPER(c)),'',
LEFT(GROUP_CONCAT(c ORDER BY codepoint), 20)) AS example
FROM v_bmp
GROUP BY 1, 2, 3, 4, 5, 6;
Bc=BLc Bc=BUc c=L(c) c=U(c) c~~L(c) c~~U(c) COUNT(*) example
0 0 1 1 1 1 4 Dž,Lj,Nj,Dz
0 1 1 1 1 1 985 A,B,C,D,E,F,G,H,I,J,
1 0 1 0 1 0 2 ı,ͅ
1 0 1 1 1 1 991 a,b,c,d,e,f,g,h,i,j,
1 1 1 1 1 1 63554
#
# BMP characters with upper/lower mapping
#
SELECT
codepoint_hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)),
Expand Down Expand Up @@ -2005,6 +2029,35 @@ FF57 FF57 FF37
FF58 FF58 FF38
FF59 FF59 FF39
FF5A FF5A FF3A
#
# BMP characters with a non-trivial upper/lower mapping
#
SELECT
codepoint_hex4 as hex4,
HEX(CAST(LOWER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_l,
HEX(CAST(UPPER(c) AS CHAR CHARACTER SET ucs2)) AS hex4_u,
c=LOWER(c) AS `c=L`,
c=UPPER(c) AS `c=U`,
c LIKE LOWER(c) AS `c~~L`,
c LIKE UPPER(c) AS `c~~U`,
c,
LOWER(c) AS `L(c)`,
UPPER(c) AS `U(c)`
FROM v_bmp
WHERE NOT (
(BINARY(c)=BINARY(LOWER(c)) OR BINARY(c)=BINARY(UPPER(c))) AND
c = LOWER(c) AND
c = UPPER(c) AND
c LIKE UPPER(c) AND
c LIKE LOWER(c)
);
hex4 hex4_l hex4_u c=L c=U c~~L c~~U c L(c) U(c)
0131 0131 0049 1 0 1 0 ı ı I
01C5 01C6 01C4 1 1 1 1 Dž dž DŽ
01C8 01C9 01C7 1 1 1 1 Lj lj LJ
01CB 01CC 01CA 1 1 1 1 Nj nj NJ
01F2 01F3 01F1 1 1 1 1 Dz dz DZ
0345 0345 0399 1 0 1 0 ͅ ͅ Ι
DROP VIEW v_bmp;
#
# End of 10.7 tests
Expand Down
2 changes: 1 addition & 1 deletion mysql-test/main/ctype_ucs2_unicode_520_ci_casefold.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
--echo # MDEV-30716 Wrong casefolding in xxx_unicode_520_ci for U+0700..U+07FF
--echo #

SET @@collation_connection=ucs2_unicode_520_ci;
SET @@collation_connection=ucs2_unicode_520_ci, @@character_set_results=utf8mb3;
--source include/ctype_unicode_casefold_bmp.inc

--echo #
Expand Down

0 comments on commit 9e7afa7

Please sign in to comment.