Postgre物理Page分析

2022-08-17 12:31:37 浏览数 (1)

pageinspect分析工具

  • 编译安装postgres extension
代码语言:javascript复制
// 安装 postgres extension
[root@centos-linux ~]$ mkdir /usr/local/pgsql &&  chown -R perrynzhou:perrynzhou /usr/local/pgsql && chmod -R 755 /usr/local/pgsql

[perrynzhou@centos-linux ~]$cd /home/perrynzhou/postgresql-14rc1 && env CFLAGS="-g -O0" ./configure
[perrynzhou@centos-linux ~]$c cd /home/perrynzhou/postgresql-14rc1/contrib &&  make && make install


// 安装完毕后的extension
[perrynzhou@centos-linux ~]$ ls -l /usr/local/pgsql/bin/
total 192
-rwxr-xr-x. 1 perrynzhou perrynzhou 95896 Nov 17 00:06 oid2name
-rwxr-xr-x. 1 perrynzhou perrynzhou 94632 Nov 17 00:06 vacuumlo
[perrynzhou@centos-linux ~]$ ls -l /usr/local/pgsql/lib/
total 4952
-rwxr-xr-x. 1 perrynzhou perrynzhou  70448 Nov 17 00:06 adminpack.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 173784 Nov 17 00:06 amcheck.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  24072 Nov 17 00:06 auth_delay.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  63152 Nov 17 00:06 auto_explain.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36720 Nov 17 00:06 autoinc.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 195416 Nov 17 00:06 bloom.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  79536 Nov 17 00:06 btree_gin.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 619560 Nov 17 00:06 btree_gist.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36792 Nov 17 00:06 citext.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 120040 Nov 17 00:06 cube.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 134000 Nov 17 00:06 dblink.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  28240 Nov 17 00:06 dict_int.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  31208 Nov 17 00:06 dict_xsyn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  12208 Nov 17 00:06 earthdistance.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 100744 Nov 17 00:06 file_fdw.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  61136 Nov 17 00:06 fuzzystrmatch.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 229144 Nov 17 00:06 hstore.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36376 Nov 17 00:06 insert_username.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 170080 Nov 17 00:06 _int.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 130848 Nov 17 00:06 isn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36168 Nov 17 00:06 lo.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 192416 Nov 17 00:06 ltree.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  36416 Nov 17 00:06 moddatetime.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  46952 Nov 17 00:06 old_snapshot.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 269768 Nov 17 00:06 pageinspect.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  16008 Nov 17 00:06 passwordcheck.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  47944 Nov 17 00:06 pg_buffercache.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 453664 Nov 17 00:06 pgcrypto.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  22744 Nov 17 00:06 pg_freespacemap.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  78784 Nov 17 00:06 pg_prewarm.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  70200 Nov 17 00:06 pgrowlocks.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 116472 Nov 17 00:06 pg_stat_statements.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 153928 Nov 17 00:06 pgstattuple.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  50928 Nov 17 00:06 pg_surgery.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 142808 Nov 17 00:06 pg_trgm.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  73536 Nov 17 00:06 pg_visibility.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 376072 Nov 17 00:06 postgres_fdw.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  49984 Nov 17 00:06 refint.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  82760 Nov 17 00:06 seg.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  74720 Nov 17 00:06 tablefunc.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  38896 Nov 17 00:06 tcn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  68280 Nov 17 00:06 test_decoding.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  55976 Nov 17 00:06 tsm_system_rows.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  56520 Nov 17 00:06 tsm_system_time.so
-rwxr-xr-x. 1 perrynzhou perrynzhou  40912 Nov 17 00:06 unaccent.so
  • 启动数据库并加载
代码语言:javascript复制
// 启动需要取消auto vacuum
[perrynzhou@centos-linux ~]$ pg_ctl -D /postgres/data -l logfile stop
[perrynzhou@centos-linux ~]$ pg_ctl -D /postgres/data -l logfile start
[perrynzhou@centos-linux ~]$ psql -h 127.0.0.1 -d sampledb

// 加载extension
sampledb=# create extension pageinspect;
CREATE EXTENSION

// 创建测试表
sampledb=# create table happy(id int,name varchar);
CREATE TABLE

物理也分析

  • 表的隐藏列的含义和分析,PG一个表包含了tableoid/cmax/xmax/cmin/xmin/ctid这几个隐藏列,其中tableoid是表文件的唯一标识,xmin代表当向表中插入一行数据的的事务ID;xmax字段如果值为0,标识这一行的数据没有被删除,如果表中的这行数据被删除,xmax的值就是执行这行数据删除的事务ID;cmin是插入事务内命令行标识;cmax是删除事务内命令行标识;这些标识用于MVCC中判断row是否对于其他事务可见
代码语言:javascript复制
sampledb=# d happy
                    Table "public.happy"
 Column |       Type        | Collation | Nullable | Default 
-------- ------------------- ----------- ---------- ---------
 id     | integer           |           |          | 
 name   | character varying |           |          | 

sampledb=# begin;
BEGIN

// 查看表的隐藏列
sampledb=# select  attname, format_type (atttypid, atttypmod) from  pg_attribute as a,pg_class as b  where a.attrelid = b.oid and b.relname='happy';
 attname  |    format_type    
---------- -------------------
 tableoid | oid
 cmax     | cid
 xmax     | xid
 cmin     | cid
 xmin     | xid
 ctid     | tid
 id       | integer
 name     | character varying
 
 // 开始插入一条数据,xmin是执行插入语句事务的ID,xmax是为0
 
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# select xmin,xmax,cmin,cmax,* from  happy;
 xmin | xmax | cmin | cmax |   id   |               name               
------ ------ ------ ------ -------- ----------------------------------
 5823 |    0 |    0 |    0 | 739849 | 26297c7c1d030e111601a1ce070eae5f
(1 row)

// 删除这条数据,然后回滚观察xmax的值

sampledb=*# delete from happy;
DELETE 1
sampledb=*# rollback;
ROLLBACK

sampledb=# select xmin,xmax,cmin,cmax,* from  happy;
 xmin | xmax | cmin | cmax |  id   |               name               
------ ------ ------ ------ ------- ----------------------------------
 5828 | 5829 |    0 |    0 | 52794 | 616bccad918d0b2de733f544bc4a6eb7
  • 表中插入数据,通过pageinspect分析可以看出t_xmin和t_xmax,t_xmin对应的是隐藏字段xmin,t_xmax对应是隐藏字段xmax。insert数据后,t_xmin为当前插入事务的ID。该记录没有任何的更新t_xmax也是为0
代码语言:javascript复制
// 开启一个事务
sampledb=# begin;
BEGIN

// 插入三条记录
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# insert into happy  select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1

// 通过工具分析该表的物理page,可以看出可以把数据打印出来
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
   5831 |      0 | {"\x26f80300","\x436565633366636139643261356639386266646434343833623235386334306266"}
   5831 |      0 | {"\x865d0100","\x433262663962343761376665643665343363333033373837313730333263396261"}
   5831 |      0 | {"\x6dc10300","\x433963306533616439363931316362326261386332333031663530333339313037"}
(3 rows)
  • 表中更新三条记录,通过pageinspect工具分析看出PG中的表更新是先把原来的记录标记为删除(xmin=xmax设置为事务5831),接着插入三条记录,从新设置这三条记录xmin和xmax(xmin=当前事务ID5831,xmax=0).这里想想如果vacuum(非完全清理)如果做的不及时,page中大量需要清理的dead 记录。vacuum做完会导致page中的记录被预留下来,为后面插入做准备,磁盘空间依然会预留下来。如果一个表更新的记录非常多,但是后面插入的非常少,这样就会导致表很大,但是实际的记录就非常少;如果做全完清理的vacuum,会释放dead 记录的磁盘空间
代码语言:javascript复制
// 查询插入的三条记录,之前看到的t_min设置为了插入事务的ID,t_max为0
sampledb=*# select * from happy;
   id   |               name               
-------- ----------------------------------
 260134 | eec3fca9d2a5f98bfdd4483b258c40bf
  89478 | 2bf9b47a7fed6e43c30378717032c9ba
 246125 | 9c0e3ad96911cb2ba8c2301f50339107
(3 rows)

// 更新该表的所有记录
sampledb=*# update happy set name=md5(random()::text)  where id>0;
UPDATE 3

// 更新后的数据
sampledb=*# select * from happy;
   id   |               name               
-------- ----------------------------------
 260134 | 39b9970f54f6ac4b709dabcd1cb7516e
  89478 | 99277b85a7a705db61d24e77ca70589e
 246125 | 42a1b0db63c95f6221526b04ce1cafaf
(3 rows)

// 通过工具查看表的t_xmin和t_xmax
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
   5831 |   5831 | {"\x26f80300","\x436565633366636139643261356639386266646434343833623235386334306266"}
   5831 |   5831 | {"\x865d0100","\x433262663962343761376665643665343363333033373837313730333263396261"}
   5831 |   5831 | {"\x6dc10300","\x433963306533616439363931316362326261386332333031663530333339313037"}
   5831 |      0 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
   5831 |      0 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
   5831 |      0 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
(6 rows)
  • 更新记录后,分别执行非完全清理的vacuum和完全清理的vacuum.做完全清理vacuum,数据会把dead记录物理page空间归还给OS,会留实际有效记录的空间;如果做非完全清理的vacuum,存在dead记录仅仅把记录的数据清理了,dead占用物理page的磁盘空间依然保留着。
代码语言:javascript复制
// 清空表的所有page,并插入三条记录
sampledb=# truncate happy;
TRUNCATE TABLE
sampledb=# insert into happy values(260134,'39b9970f54f6ac4b709dabcd1cb7516e');
INSERT 0 1
sampledb=# insert into happy values(89478,'99277b85a7a705db61d24e77ca70589e');
INSERT 0 1
sampledb=# insert into happy values(246125,'42a1b0db63c95f6221526b04ce1cafaf');
INSERT 0 1
sampledb=# update happy set name=md5(random()::text)  where id>0;
UPDATE 3
sampledb=# select * from happy;
   id   |               name               
-------- ----------------------------------
 260134 | ab95cb5ab40f9a9a99d41667b77e6e39
  89478 | 4355842e9bbfefdedb6fa26e7d5734dd
 246125 | bc4b418a4ffa286e183310c35b3d90c3
(3 rows)


// 更新后表中page仍然有6条记录,三条dead 记录;三条新插入的记录
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
   5839 |   5842 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
   5840 |   5842 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
   5841 |   5842 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
   5842 |      0 | {"\x26f80300","\x436162393563623561623430663961396139396434313636376237376536653339"}
   5842 |      0 | {"\x865d0100","\x433433353538343265396262666566646564623666613236653764353733346464"}
   5842 |      0 | {"\x6dc10300","\x436263346234313861346666613238366531383333313063333562336439306333"}
(6 rows)

// 查看表的大小,目前是是在一个page内
sampledb=# select relpages, relpages*8192 as total_bytes, pg_relation_size('public.happy') as relsize  from pg_class where relname = 'happy';
 relpages | total_bytes | relsize 
---------- ------------- ---------
        0 |           0 |    8192
(1 row)


// 针对表做非完全清理的vacuum
sampledb=# vacuum happy;
VACUUM

// pageinspect工具分析,可以看出原来的dead记录数据被清理,但是空间依然保留着
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
        |        | 
        |        | 
        |        | 
   5842 |      0 | {"\x26f80300","\x436162393563623561623430663961396139396434313636376237376536653339"}
   5842 |      0 | {"\x865d0100","\x433433353538343265396262666566646564623666613236653764353733346464"}
   5842 |      0 | {"\x6dc10300","\x436263346234313861346666613238366531383333313063333562336439306333"}
(6 rows)

// 当前已经使用的page大小
sampledb=#  select relpages, relpages*8192 as total_bytes, pg_relation_size('public.happy') as relsize  from pg_class where relname = 'happy';
 relpages | total_bytes | relsize 
---------- ------------- ---------
        1 |        8192 |    8192
(1 row)

// 做完全清理的vacuum
sampledb=# vacuum full happy;
VACUUM

// 查询做完后的该表的磁盘空间,通过工具可以看到这个表的dead记录空闲出来的空间被回收了,仅仅只有三条记录的空间
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
   5842 |      0 | {"\x26f80300","\x436162393563623561623430663961396139396434313636376237376536653339"}
   5842 |      0 | {"\x865d0100","\x433433353538343265396262666566646564623666613236653764353733346464"}
   5842 |      0 | {"\x6dc10300","\x436263346234313861346666613238366531383333313063333562336439306333"}
(3 rows)
  • 删除该表的数据,依然会更新t_xmax为执行删除操作事务的ID。如果在做vacuum happy,会回收该表中已经删除记录的空间。如果PG配置auto vacuum,会定期做vacuum,清理物理page空间。但是如果上层应用大量的删除和更新操作,vacuum未必能做的过来,会导致表过渡膨胀,dead记录依然会占用很多空间。这样是当前PG存储引擎存在的问题。社区在开发一款新的引擎zheap,参考oracle的设计,使用undo来设置mvcc.这样dead记录不会和有效数据存储在一起,能解决这个问题。
代码语言:javascript复制
// 同一个事务内插入三条记录,然后进行删除
sampledb=# begin;
BEGIN
sampledb=*# insert into happy values(260134,'39b9970f54f6ac4b709dabcd1cb7516e');
INSERT 0 1
sampledb=*# insert into happy values(89478,'99277b85a7a705db61d24e77ca70589e');
INSERT 0 1
sampledb=*# insert into happy values(246125,'42a1b0db63c95f6221526b04ce1cafaf');
INSERT 0 1

// 插入三条记录的t_xmin等于执行插入事务的ID,t_xmax=0
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
   5846 |      0 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
   5846 |      0 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
   5846 |      0 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
(3 rows)

// 删除数据表的记录
sampledb=*# delete from happy;
DELETE 3

// 删除后数据t_xmin=t_xmax等于当前事务ID(5846)
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
 t_xmin | t_xmax |                                    tuple_data_split                                     
-------- -------- -----------------------------------------------------------------------------------------
   5846 |   5846 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
   5846 |   5846 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
   5846 |   5846 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
(3 rows)

// 事务提交
sampledb=*# commit;
COMMIT

// 执行非完全清理的vacuum
sampledb=# vacuum happy;
VACUUM

// 再次查询发现事务的物理空间被回收了,执行删除操作这个表没有占用任何的物理page
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
ERROR:  block number 0 is out of range for relation "happy"

0 人点赞