varchar在字符集不同情况下导致的查询异常

2023-03-16 15:55:47 浏览数 (2)

MySQL版本信息和sql_mode

代码语言:javascript复制
[sb1]> s
--------------
mysql  Ver 8.0.29 for Linux on x86_64 (MySQL Community Server - GPL)

Connection id:    21
Current database:  sb1
Current user:    root@localhost
SSL:      Not in use
Current pager:    less
Using outfile:    ''
Using delimiter:  ;
Server version:    8.0.25-16 GreatSQL, Release 16, Revision 8bb0e5af297
Protocol version:  10
Connection:    Localhost via UNIX socket
Insert id:    2
Server characterset:  utf8mb4
Db     characterset:  utf8mb4
Client characterset:  utf8mb4
Conn.  characterset:  utf8mb4
UNIX socket:    /data/GreatSQL/mysql.sock
Uptime:      3 days 18 hours 26 min 4 sec

Threads: 1  Questions: 75  Slow queries: 0  Opens: 204  Flush tables: 3  Open tables: 120  Queries per second avg: 0.000
--------------


[sb1]> select @@sql_mode;
 ----------------------------------------------------------------------------------------------------------------------- 
| @@sql_mode                                                                                                            |
 ----------------------------------------------------------------------------------------------------------------------- 
| ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION |
 ----------------------------------------------------------------------------------------------------------------------- 
1 row in set (0.00 sec)

创建测试数据集

代码语言:javascript复制
use sb1;

CREATE TABLE `t_utf8` (
  `id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
  `code` varchar(255) NOT NULL COMMENT '代码',
  `channel` varchar(255) NOT NULL COMMENT '渠道',
  PRIMARY KEY (`id`),
  KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 ;

CREATE TABLE `t_utf8mb4` (
  `id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
  `code` varchar(255) NOT NULL COMMENT '代码',
  `channel` varchar(255) NOT NULL COMMENT '渠道',
  PRIMARY KEY (`id`),
  KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 ;



INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );

INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );

查询测试

代码语言:javascript复制
-- utf8 字符集  , 查询不符合预期
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA ';  
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
| id | code          | c               | length(code) | CHAR_LENGTH(code) | hex(code)                  |
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
|  1 | MM_WWWWW_AAA  | -MM_WWWWW_AAA - |           13 |                13 | 4D4D5F57575757575F41414120 |
|  2 | MM_WWWWW_AAA  | -MM_WWWWW_AAA-  |           12 |                12 | 4D4D5F57575757575F414141   |
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
2 rows in set (0.00 sec)
上面的这个查询 带空格的记录,查询结果把未带空格的记录也查出来了


select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA'; 
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
| id | code          | c               | length(code) | CHAR_LENGTH(code) | hex(code)                  |
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
|  1 | MM_WWWWW_AAA  | -MM_WWWWW_AAA - |           13 |                13 | 4D4D5F57575757575F41414120 |
|  2 | MM_WWWWW_AAA  | -MM_WWWWW_AAA-  |           12 |                12 | 4D4D5F57575757575F414141   |
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
2 rows in set (0.00 sec)
上面的这个查询 不带空格的记录,查询结果把带空格的记录也查出来了

select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)



-- utf8mb4 字符集 , 查询符合预期
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA ';
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
| id | code          | c               | length(code) | CHAR_LENGTH(code) | hex(code)                  |
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
|  1 | MM_WWWWW_AAA  | -MM_WWWWW_AAA - |           13 |                13 | 4D4D5F57575757575F41414120 |
 ---- --------------- ----------------- -------------- ------------------- ---------------------------- 
1 row in set (0.00 sec)



select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA';
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
| id | code         | c              | length(code) | CHAR_LENGTH(code) | hex(code)                |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
|  2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
1 row in set (0.00 sec)



select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8mb4 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)

对此现象的解释

代码语言:javascript复制
官方文档: https://dev.mysql.com/doc/refman/8.0/en/char.html
 
0 MySQL里面的排序规则的pad属性有2类:NO PAD 和 PAD SPACE。
1 NO PAD 排序规则将尾随空格视为比较中的重要字符,就像任何其他字符一样。(utf8mb4字符集的PAD_ATTRIBUTE值为NO PAD)
2 PAD SPACE 排序规则在比较中将尾随空格视为无关紧要,比较字符串时不考虑尾随空格。 (utf8字符集的PAD_ATTRIBUTE值为PAD SPACE)
3 更详细的请参见比较中的尾部空间处理https://dev.mysql.com/doc/refman/8.0/en/charset-binary-collations.html#charset-binary-collations-trailing-space-comparisons。
4 sql_mode的值对尾随空格的比较行为没有影响。

  可以看下 utf8 和 utf8mb4 的 PAD_ATTRIBUTE 属性的区别:
      [sb1]> select PAD_ATTRIBUTE,count(*) from INFORMATION_SCHEMA.COLLATIONS group by  PAD_ATTRIBUTE;
         --------------- ---------- 
        | PAD_ATTRIBUTE | count(*) |
         --------------- ---------- 
        | PAD SPACE     |      222 |
        | NO PAD        |       50 |
         --------------- ---------- 
        2 rows in set (0.00 sec)

        [sb1]> select * from INFORMATION_SCHEMA.COLLATIONS where CHARACTER_SET_NAME='utf8mb4' and COLLATION_NAME like 'utf8mb4_0900_ai_ci%' ;
         -------------------- -------------------- ----- ------------ ------------- --------- --------------- 
        | COLLATION_NAME     | CHARACTER_SET_NAME | ID  | IS_DEFAULT | IS_COMPILED | SORTLEN | PAD_ATTRIBUTE |
         -------------------- -------------------- ----- ------------ ------------- --------- --------------- 
        | utf8mb4_0900_ai_ci | utf8mb4            | 255 | Yes        | Yes         |       0 | NO PAD        |
         -------------------- -------------------- ----- ------------ ------------- --------- --------------- 
        1 row in set (0.00 sec)

        [sb1]> select * from INFORMATION_SCHEMA.COLLATIONS where CHARACTER_SET_NAME='utf8' and COLLATION_NAME like '%general_ci%' ;
         ----------------- -------------------- ---- ------------ ------------- --------- --------------- 
        | COLLATION_NAME  | CHARACTER_SET_NAME | ID | IS_DEFAULT | IS_COMPILED | SORTLEN | PAD_ATTRIBUTE |
         ----------------- -------------------- ---- ------------ ------------- --------- --------------- 
        | utf8_general_ci | utf8               | 33 | Yes        | Yes         |       1 | PAD SPACE     |
         ----------------- -------------------- ---- ------------ ------------- --------- --------------- 
        1 row in set (0.00 sec)

解决方法

代码语言:javascript复制
1、utf8字符集下,改用二进制方式比较
    [sb1]> select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = binary 'MM_WWWWW_AAA ';
     ---- --------------- ----------------- -------------- ---------------------------- 
    | id | code          | c               | length(code) | hex(code)                  |
     ---- --------------- ----------------- -------------- ---------------------------- 
    |  1 | MM_WWWWW_AAA  | -MM_WWWWW_AAA - |           13 | 4D4D5F57575757575F41414120 |
     ---- --------------- ----------------- -------------- ---------------------------- 
    1 row in set (0.00 sec)
    [sb1]> select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = binary 'MM_WWWWW_AAA';
     ---- -------------- ---------------- -------------- -------------------------- 
    | id | code         | c              | length(code) | hex(code)                |
     ---- -------------- ---------------- -------------- -------------------------- 
    |  2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 | 4D4D5F57575757575F414141 |
     ---- -------------- ---------------- -------------- -------------------------- 
    1 row in set (0.01 sec)


2、转为utf8mb4字符集
    ALTER TABLE xxx CONVERT TO CHARACTER SET utf8mb4 ;

注意:

对于那些删除了尾随字符或比较忽略了尾随字符的情况,如果列具有需要唯一值的索引,则在列中插入仅在尾随字符数量上不同的值将导致重复键错误。

       例如,如果表包含'a',则尝试存储'a ' 会导致重复键错误。

           [sb1]> INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );

           [sb1]> INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );

           [sb1]> alter table t_utf8 add unique key uniq_code(code);

           ERROR 1062 (23000): Duplicate entry 'MM_WWWWW_AAA ' for key 't_utf8.uniq_code'

另外, 如果 是char类型,也会有意想不到的结果。如下:

代码语言:javascript复制
use sb2;

CREATE TABLE `t_utf8` (
  `id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
  `code` char(20)  NOT NULL COMMENT '代码',
  `channel` char(20)  NOT NULL COMMENT '渠道',
  PRIMARY KEY (`id`),
  KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 ;

CREATE TABLE `t_utf8mb4` (
  `id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT '主键自增',
  `code` char(20)  NOT NULL COMMENT '代码',
  `channel` char(20)  NOT NULL COMMENT '渠道',
  PRIMARY KEY (`id`),
  KEY `idx_code` (`code`(10)) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 ;



INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );

INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA ','aaaa' );
INSERT INTO t_utf8mb4 (`code`,channel ) VALUES ('MM_WWWWW_AAA','aaaa' );



出现下面的查询结果的根本原因:char 在入库的时候 会把后面的空格去掉!

-- utf8 字符集 
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA ';
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
| id | code         | c              | length(code) | CHAR_LENGTH(code) | hex(code)                |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
|  1 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
|  2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
2 rows in set (0.01 sec)

select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8 where `code` = 'MM_WWWWW_AAA';
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
| id | code         | c              | length(code) | CHAR_LENGTH(code) | hex(code)                |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
|  1 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
|  2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
2 rows in set (0.00 sec)

select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)


-- utf8mb4 字符集
select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA ';
Empty set (0.00 sec)

select id,code,concat('-',code,'-') as c,length(code),CHAR_LENGTH(code),hex(code) from t_utf8mb4 where `code` = 'MM_WWWWW_AAA';
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
| id | code         | c              | length(code) | CHAR_LENGTH(code) | hex(code)                |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
|  1 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
|  2 | MM_WWWWW_AAA | -MM_WWWWW_AAA- |           12 |                12 | 4D4D5F57575757575F414141 |
 ---- -------------- ---------------- -------------- ------------------- -------------------------- 
2 rows in set (0.00 sec)

select id,code,concat('-',code,'-') as c,length(code),hex(code) from t_utf8mb4 where `code` = ' MM_WWWWW_AAA ';
Empty set (0.00 sec)

0 人点赞