MySQL版本信息和sql_mode
代码语言:javascript复制[sb1]> s
--------------
mysql Ver 8.0.29 for Linux on x86_64 (MySQL Community Server - GPL)
Connection id: 21
Current database: sb1
Current user: root@localhost
SSL: Not in use
Current pager: less
Using outfile: ''
Using delimiter: ;
Server version: 8.0.25-16 GreatSQL, Release 16, Revision 8bb0e5af297
Protocol version: 10
Connection: Localhost via UNIX socket
Insert id: 2
Server characterset: utf8mb4
Db characterset: utf8mb4
Client characterset: utf8mb4
Conn. characterset: utf8mb4
UNIX socket: /data/GreatSQL/mysql.sock
Uptime: 3 days 18 hours 26 min 4 sec
Threads: 1 Questions: 75 Slow queries: 0 Opens: 204 Flush tables: 3 Open tables: 120 Queries per second avg: 0.000
--------------
[sb1]> select @@sql_mode;
-----------------------------------------------------------------------------------------------------------------------
| @@sql_mode |
-----------------------------------------------------------------------------------------------------------------------
| ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION |
-----------------------------------------------------------------------------------------------------------------------
1 row in set (0.00 sec)
创建测试数据集
代码语言:javascript复制use sb1;
CREATE TABLE `t_utf8` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
`code` varchar(255) NOT NULL COMMENT '代码',
`channel` varchar(255) NOT NULL COMMENT '渠道',
PRIMARY KEY (`id`),
KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 ;
CREATE TABLE `t_utf8mb4` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
`code` varchar(255) NOT NULL COMMENT '代码',
`channel` varchar(255) NOT NULL COMMENT '渠道',
PRIMARY KEY (`id`),
KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 ;
INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );
INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );
查询测试
代码语言:javascript复制-- utf8 字符集 , 查询不符合预期
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA ';
---- --------------- ----------------- -------------- ------------------- ----------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- --------------- ----------------- -------------- ------------------- ----------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA - | 13 | 13 | 4D4D5F57575757575F41414120 |
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
---- --------------- ----------------- -------------- ------------------- ----------------------------
2 rows in set (0.00 sec)
上面的这个查询 带空格的记录,查询结果把未带空格的记录也查出来了
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA';
---- --------------- ----------------- -------------- ------------------- ----------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- --------------- ----------------- -------------- ------------------- ----------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA - | 13 | 13 | 4D4D5F57575757575F41414120 |
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
---- --------------- ----------------- -------------- ------------------- ----------------------------
2 rows in set (0.00 sec)
上面的这个查询 不带空格的记录,查询结果把带空格的记录也查出来了
select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)
-- utf8mb4 字符集 , 查询符合预期
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA ';
---- --------------- ----------------- -------------- ------------------- ----------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- --------------- ----------------- -------------- ------------------- ----------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA - | 13 | 13 | 4D4D5F57575757575F41414120 |
---- --------------- ----------------- -------------- ------------------- ----------------------------
1 row in set (0.00 sec)
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA';
---- -------------- ---------------- -------------- ------------------- --------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- -------------- ---------------- -------------- ------------------- --------------------------
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
---- -------------- ---------------- -------------- ------------------- --------------------------
1 row in set (0.00 sec)
select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8mb4 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)
对此现象的解释
代码语言:javascript复制官方文档: https://dev.mysql.com/doc/refman/8.0/en/char.html
0 MySQL里面的排序规则的pad属性有2类:NO PAD 和 PAD SPACE。
1 NO PAD 排序规则将尾随空格视为比较中的重要字符,就像任何其他字符一样。(utf8mb4字符集的PAD_ATTRIBUTE值为NO PAD)
2 PAD SPACE 排序规则在比较中将尾随空格视为无关紧要,比较字符串时不考虑尾随空格。 (utf8字符集的PAD_ATTRIBUTE值为PAD SPACE)
3 更详细的请参见比较中的尾部空间处理https://dev.mysql.com/doc/refman/8.0/en/charset-binary-collations.html#charset-binary-collations-trailing-space-comparisons。
4 sql_mode的值对尾随空格的比较行为没有影响。
可以看下 utf8 和 utf8mb4 的 PAD_ATTRIBUTE 属性的区别:
[sb1]> select PAD_ATTRIBUTE,count(*) from INFORMATION_SCHEMA.COLLATIONS group by PAD_ATTRIBUTE;
--------------- ----------
| PAD_ATTRIBUTE | count(*) |
--------------- ----------
| PAD SPACE | 222 |
| NO PAD | 50 |
--------------- ----------
2 rows in set (0.00 sec)
[sb1]> select * from INFORMATION_SCHEMA.COLLATIONS where CHARACTER_SET_NAME='utf8mb4' and COLLATION_NAME like 'utf8mb4_0900_ai_ci%' ;
-------------------- -------------------- ----- ------------ ------------- --------- ---------------
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT | IS_COMPILED | SORTLEN | PAD_ATTRIBUTE |
-------------------- -------------------- ----- ------------ ------------- --------- ---------------
| utf8mb4_0900_ai_ci | utf8mb4 | 255 | Yes | Yes | 0 | NO PAD |
-------------------- -------------------- ----- ------------ ------------- --------- ---------------
1 row in set (0.00 sec)
[sb1]> select * from INFORMATION_SCHEMA.COLLATIONS where CHARACTER_SET_NAME='utf8' and COLLATION_NAME like '%general_ci%' ;
----------------- -------------------- ---- ------------ ------------- --------- ---------------
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT | IS_COMPILED | SORTLEN | PAD_ATTRIBUTE |
----------------- -------------------- ---- ------------ ------------- --------- ---------------
| utf8_general_ci | utf8 | 33 | Yes | Yes | 1 | PAD SPACE |
----------------- -------------------- ---- ------------ ------------- --------- ---------------
1 row in set (0.00 sec)
解决方法
代码语言:javascript复制1、utf8字符集下,改用二进制方式比较
[sb1]> select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = binary 'MM_WWWWW_AAA ';
---- --------------- ----------------- -------------- ----------------------------
| id | code | c | length(code) | hex(code) |
---- --------------- ----------------- -------------- ----------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA - | 13 | 4D4D5F57575757575F41414120 |
---- --------------- ----------------- -------------- ----------------------------
1 row in set (0.00 sec)
[sb1]> select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = binary 'MM_WWWWW_AAA';
---- -------------- ---------------- -------------- --------------------------
| id | code | c | length(code) | hex(code) |
---- -------------- ---------------- -------------- --------------------------
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 4D4D5F57575757575F414141 |
---- -------------- ---------------- -------------- --------------------------
1 row in set (0.01 sec)
2、转为utf8mb4字符集
ALTER TABLE xxx CONVERT TO CHARACTER SET utf8mb4 ;
注意:
对于那些删除了尾随字符或比较忽略了尾随字符的情况,如果列具有需要唯一值的索引,则在列中插入仅在尾随字符数量上不同的值将导致重复键错误。
例如,如果表包含'a',则尝试存储'a ' 会导致重复键错误。
[sb1]> INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
[sb1]> INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );
[sb1]> alter table t_utf8 add unique key uniq_code(code);
ERROR 1062 (23000): Duplicate entry 'MM_WWWWW_AAA ' for key 't_utf8.uniq_code'
另外, 如果 是char类型,也会有意想不到的结果。如下:
代码语言:javascript复制use sb2;
CREATE TABLE `t_utf8` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
`code` char(20) NOT NULL COMMENT '代码',
`channel` char(20) NOT NULL COMMENT '渠道',
PRIMARY KEY (`id`),
KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 ;
CREATE TABLE `t_utf8mb4` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
`code` char(20) NOT NULL COMMENT '代码',
`channel` char(20) NOT NULL COMMENT '渠道',
PRIMARY KEY (`id`),
KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 ;
INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );
INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );
出现下面的查询结果的根本原因:char 在入库的时候 会把后面的空格去掉!
-- utf8 字符集
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA ';
---- -------------- ---------------- -------------- ------------------- --------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- -------------- ---------------- -------------- ------------------- --------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
---- -------------- ---------------- -------------- ------------------- --------------------------
2 rows in set (0.01 sec)
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA';
---- -------------- ---------------- -------------- ------------------- --------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- -------------- ---------------- -------------- ------------------- --------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
---- -------------- ---------------- -------------- ------------------- --------------------------
2 rows in set (0.00 sec)
select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)
-- utf8mb4 字符集
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA ';
Empty set (0.00 sec)
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA';
---- -------------- ---------------- -------------- ------------------- --------------------------
| id | code | c | length(code) | CHAR_LENGTH(code) | hex(code) |
---- -------------- ---------------- -------------- ------------------- --------------------------
| 1 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
| 2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- | 12 | 12 | 4D4D5F57575757575F414141 |
---- -------------- ---------------- -------------- ------------------- --------------------------
2 rows in set (0.00 sec)
select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8mb4 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)