pageinspect分析工具
代码语言:javascript
复制// 安装 postgres extension
[root@centos-linux ~]$ mkdir /usr/local/pgsql && chown -R perrynzhou:perrynzhou /usr/local/pgsql && chmod -R 755 /usr/local/pgsql
[perrynzhou@centos-linux ~]$cd /home/perrynzhou/postgresql-14rc1 && env CFLAGS="-g -O0" ./configure
[perrynzhou@centos-linux ~]$c cd /home/perrynzhou/postgresql-14rc1/contrib && make && make install
// 安装完毕后的extension
[perrynzhou@centos-linux ~]$ ls -l /usr/local/pgsql/bin/
total 192
-rwxr-xr-x. 1 perrynzhou perrynzhou 95896 Nov 17 00:06 oid2name
-rwxr-xr-x. 1 perrynzhou perrynzhou 94632 Nov 17 00:06 vacuumlo
[perrynzhou@centos-linux ~]$ ls -l /usr/local/pgsql/lib/
total 4952
-rwxr-xr-x. 1 perrynzhou perrynzhou 70448 Nov 17 00:06 adminpack.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 173784 Nov 17 00:06 amcheck.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 24072 Nov 17 00:06 auth_delay.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 63152 Nov 17 00:06 auto_explain.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 36720 Nov 17 00:06 autoinc.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 195416 Nov 17 00:06 bloom.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 79536 Nov 17 00:06 btree_gin.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 619560 Nov 17 00:06 btree_gist.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 36792 Nov 17 00:06 citext.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 120040 Nov 17 00:06 cube.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 134000 Nov 17 00:06 dblink.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 28240 Nov 17 00:06 dict_int.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 31208 Nov 17 00:06 dict_xsyn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 12208 Nov 17 00:06 earthdistance.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 100744 Nov 17 00:06 file_fdw.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 61136 Nov 17 00:06 fuzzystrmatch.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 229144 Nov 17 00:06 hstore.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 36376 Nov 17 00:06 insert_username.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 170080 Nov 17 00:06 _int.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 130848 Nov 17 00:06 isn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 36168 Nov 17 00:06 lo.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 192416 Nov 17 00:06 ltree.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 36416 Nov 17 00:06 moddatetime.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 46952 Nov 17 00:06 old_snapshot.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 269768 Nov 17 00:06 pageinspect.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 16008 Nov 17 00:06 passwordcheck.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 47944 Nov 17 00:06 pg_buffercache.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 453664 Nov 17 00:06 pgcrypto.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 22744 Nov 17 00:06 pg_freespacemap.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 78784 Nov 17 00:06 pg_prewarm.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 70200 Nov 17 00:06 pgrowlocks.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 116472 Nov 17 00:06 pg_stat_statements.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 153928 Nov 17 00:06 pgstattuple.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 50928 Nov 17 00:06 pg_surgery.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 142808 Nov 17 00:06 pg_trgm.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 73536 Nov 17 00:06 pg_visibility.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 376072 Nov 17 00:06 postgres_fdw.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 49984 Nov 17 00:06 refint.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 82760 Nov 17 00:06 seg.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 74720 Nov 17 00:06 tablefunc.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 38896 Nov 17 00:06 tcn.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 68280 Nov 17 00:06 test_decoding.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 55976 Nov 17 00:06 tsm_system_rows.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 56520 Nov 17 00:06 tsm_system_time.so
-rwxr-xr-x. 1 perrynzhou perrynzhou 40912 Nov 17 00:06 unaccent.so
代码语言:javascript
复制// 启动需要取消auto vacuum
[perrynzhou@centos-linux ~]$ pg_ctl -D /postgres/data -l logfile stop
[perrynzhou@centos-linux ~]$ pg_ctl -D /postgres/data -l logfile start
[perrynzhou@centos-linux ~]$ psql -h 127.0.0.1 -d sampledb
// 加载extension
sampledb=# create extension pageinspect;
CREATE EXTENSION
// 创建测试表
sampledb=# create table happy(id int,name varchar);
CREATE TABLE
物理也分析
- 表的隐藏列的含义和分析,PG一个表包含了
tableoid/cmax/xmax/cmin/xmin/ctid
这几个隐藏列,其中tableoid是表文件的唯一标识,xmin代表当向表中插入一行数据的的事务ID;xmax字段如果值为0,标识这一行的数据没有被删除,如果表中的这行数据被删除,xmax的值就是执行这行数据删除的事务ID;cmin是插入事务内命令行标识;cmax是删除事务内命令行标识;这些标识用于MVCC中判断row是否对于其他事务可见
代码语言:javascript
复制sampledb=# d happy
Table "public.happy"
Column | Type | Collation | Nullable | Default
-------- ------------------- ----------- ---------- ---------
id | integer | | |
name | character varying | | |
sampledb=# begin;
BEGIN
// 查看表的隐藏列
sampledb=# select attname, format_type (atttypid, atttypmod) from pg_attribute as a,pg_class as b where a.attrelid = b.oid and b.relname='happy';
attname | format_type
---------- -------------------
tableoid | oid
cmax | cid
xmax | xid
cmin | cid
xmin | xid
ctid | tid
id | integer
name | character varying
// 开始插入一条数据,xmin是执行插入语句事务的ID,xmax是为0
sampledb=*# insert into happy select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# select xmin,xmax,cmin,cmax,* from happy;
xmin | xmax | cmin | cmax | id | name
------ ------ ------ ------ -------- ----------------------------------
5823 | 0 | 0 | 0 | 739849 | 26297c7c1d030e111601a1ce070eae5f
(1 row)
// 删除这条数据,然后回滚观察xmax的值
sampledb=*# delete from happy;
DELETE 1
sampledb=*# rollback;
ROLLBACK
sampledb=# select xmin,xmax,cmin,cmax,* from happy;
xmin | xmax | cmin | cmax | id | name
------ ------ ------ ------ ------- ----------------------------------
5828 | 5829 | 0 | 0 | 52794 | 616bccad918d0b2de733f544bc4a6eb7
- 表中插入数据,通过pageinspect分析可以看出t_xmin和t_xmax,t_xmin对应的是隐藏字段xmin,t_xmax对应是隐藏字段xmax。insert数据后,t_xmin为当前插入事务的ID。该记录没有任何的更新t_xmax也是为0
代码语言:javascript
复制// 开启一个事务
sampledb=# begin;
BEGIN
// 插入三条记录
sampledb=*# insert into happy select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# insert into happy select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
sampledb=*# insert into happy select (random()*(10^6))::integer as id, md5(random()::text) as name;
INSERT 0 1
// 通过工具分析该表的物理page,可以看出可以把数据打印出来
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
5831 | 0 | {"\x26f80300","\x436565633366636139643261356639386266646434343833623235386334306266"}
5831 | 0 | {"\x865d0100","\x433262663962343761376665643665343363333033373837313730333263396261"}
5831 | 0 | {"\x6dc10300","\x433963306533616439363931316362326261386332333031663530333339313037"}
(3 rows)
- 表中更新三条记录,通过pageinspect工具分析看出PG中的表更新是先把原来的记录标记为删除(xmin=xmax设置为事务5831),接着插入三条记录,从新设置这三条记录xmin和xmax(xmin=当前事务ID5831,xmax=0).这里想想如果vacuum(非完全清理)如果做的不及时,page中大量需要清理的dead 记录。vacuum做完会导致page中的记录被预留下来,为后面插入做准备,磁盘空间依然会预留下来。如果一个表更新的记录非常多,但是后面插入的非常少,这样就会导致表很大,但是实际的记录就非常少;如果做全完清理的vacuum,会释放dead 记录的磁盘空间
代码语言:javascript
复制// 查询插入的三条记录,之前看到的t_min设置为了插入事务的ID,t_max为0
sampledb=*# select * from happy;
id | name
-------- ----------------------------------
260134 | eec3fca9d2a5f98bfdd4483b258c40bf
89478 | 2bf9b47a7fed6e43c30378717032c9ba
246125 | 9c0e3ad96911cb2ba8c2301f50339107
(3 rows)
// 更新该表的所有记录
sampledb=*# update happy set name=md5(random()::text) where id>0;
UPDATE 3
// 更新后的数据
sampledb=*# select * from happy;
id | name
-------- ----------------------------------
260134 | 39b9970f54f6ac4b709dabcd1cb7516e
89478 | 99277b85a7a705db61d24e77ca70589e
246125 | 42a1b0db63c95f6221526b04ce1cafaf
(3 rows)
// 通过工具查看表的t_xmin和t_xmax
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
5831 | 5831 | {"\x26f80300","\x436565633366636139643261356639386266646434343833623235386334306266"}
5831 | 5831 | {"\x865d0100","\x433262663962343761376665643665343363333033373837313730333263396261"}
5831 | 5831 | {"\x6dc10300","\x433963306533616439363931316362326261386332333031663530333339313037"}
5831 | 0 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
5831 | 0 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
5831 | 0 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
(6 rows)
- 更新记录后,分别执行非完全清理的vacuum和完全清理的vacuum.做完全清理vacuum,数据会把dead记录物理page空间归还给OS,会留实际有效记录的空间;如果做非完全清理的vacuum,存在dead记录仅仅把记录的数据清理了,dead占用物理page的磁盘空间依然保留着。
代码语言:javascript
复制// 清空表的所有page,并插入三条记录
sampledb=# truncate happy;
TRUNCATE TABLE
sampledb=# insert into happy values(260134,'39b9970f54f6ac4b709dabcd1cb7516e');
INSERT 0 1
sampledb=# insert into happy values(89478,'99277b85a7a705db61d24e77ca70589e');
INSERT 0 1
sampledb=# insert into happy values(246125,'42a1b0db63c95f6221526b04ce1cafaf');
INSERT 0 1
sampledb=# update happy set name=md5(random()::text) where id>0;
UPDATE 3
sampledb=# select * from happy;
id | name
-------- ----------------------------------
260134 | ab95cb5ab40f9a9a99d41667b77e6e39
89478 | 4355842e9bbfefdedb6fa26e7d5734dd
246125 | bc4b418a4ffa286e183310c35b3d90c3
(3 rows)
// 更新后表中page仍然有6条记录,三条dead 记录;三条新插入的记录
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
5839 | 5842 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
5840 | 5842 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
5841 | 5842 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
5842 | 0 | {"\x26f80300","\x436162393563623561623430663961396139396434313636376237376536653339"}
5842 | 0 | {"\x865d0100","\x433433353538343265396262666566646564623666613236653764353733346464"}
5842 | 0 | {"\x6dc10300","\x436263346234313861346666613238366531383333313063333562336439306333"}
(6 rows)
// 查看表的大小,目前是是在一个page内
sampledb=# select relpages, relpages*8192 as total_bytes, pg_relation_size('public.happy') as relsize from pg_class where relname = 'happy';
relpages | total_bytes | relsize
---------- ------------- ---------
0 | 0 | 8192
(1 row)
// 针对表做非完全清理的vacuum
sampledb=# vacuum happy;
VACUUM
// pageinspect工具分析,可以看出原来的dead记录数据被清理,但是空间依然保留着
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
| |
| |
| |
5842 | 0 | {"\x26f80300","\x436162393563623561623430663961396139396434313636376237376536653339"}
5842 | 0 | {"\x865d0100","\x433433353538343265396262666566646564623666613236653764353733346464"}
5842 | 0 | {"\x6dc10300","\x436263346234313861346666613238366531383333313063333562336439306333"}
(6 rows)
// 当前已经使用的page大小
sampledb=# select relpages, relpages*8192 as total_bytes, pg_relation_size('public.happy') as relsize from pg_class where relname = 'happy';
relpages | total_bytes | relsize
---------- ------------- ---------
1 | 8192 | 8192
(1 row)
// 做完全清理的vacuum
sampledb=# vacuum full happy;
VACUUM
// 查询做完后的该表的磁盘空间,通过工具可以看到这个表的dead记录空闲出来的空间被回收了,仅仅只有三条记录的空间
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
5842 | 0 | {"\x26f80300","\x436162393563623561623430663961396139396434313636376237376536653339"}
5842 | 0 | {"\x865d0100","\x433433353538343265396262666566646564623666613236653764353733346464"}
5842 | 0 | {"\x6dc10300","\x436263346234313861346666613238366531383333313063333562336439306333"}
(3 rows)
- 删除该表的数据,依然会更新t_xmax为执行删除操作事务的ID。如果在做vacuum happy,会回收该表中已经删除记录的空间。如果PG配置auto vacuum,会定期做vacuum,清理物理page空间。但是如果上层应用大量的删除和更新操作,vacuum未必能做的过来,会导致表过渡膨胀,dead记录依然会占用很多空间。这样是当前PG存储引擎存在的问题。社区在开发一款新的引擎zheap,参考oracle的设计,使用undo来设置mvcc.这样dead记录不会和有效数据存储在一起,能解决这个问题。
代码语言:javascript
复制// 同一个事务内插入三条记录,然后进行删除
sampledb=# begin;
BEGIN
sampledb=*# insert into happy values(260134,'39b9970f54f6ac4b709dabcd1cb7516e');
INSERT 0 1
sampledb=*# insert into happy values(89478,'99277b85a7a705db61d24e77ca70589e');
INSERT 0 1
sampledb=*# insert into happy values(246125,'42a1b0db63c95f6221526b04ce1cafaf');
INSERT 0 1
// 插入三条记录的t_xmin等于执行插入事务的ID,t_xmax=0
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
5846 | 0 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
5846 | 0 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
5846 | 0 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
(3 rows)
// 删除数据表的记录
sampledb=*# delete from happy;
DELETE 3
// 删除后数据t_xmin=t_xmax等于当前事务ID(5846)
sampledb=*# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
t_xmin | t_xmax | tuple_data_split
-------- -------- -----------------------------------------------------------------------------------------
5846 | 5846 | {"\x26f80300","\x433339623939373066353466366163346237303964616263643163623735313665"}
5846 | 5846 | {"\x865d0100","\x433939323737623835613761373035646236316432346537376361373035383965"}
5846 | 5846 | {"\x6dc10300","\x433432613162306462363363393566363232313532366230346365316361666166"}
(3 rows)
// 事务提交
sampledb=*# commit;
COMMIT
// 执行非完全清理的vacuum
sampledb=# vacuum happy;
VACUUM
// 再次查询发现事务的物理空间被回收了,执行删除操作这个表没有占用任何的物理page
sampledb=# select t_xmin, t_xmax,tuple_data_split('public.happy'::regclass, t_data, t_infomask, t_infomask2, t_bits) from heap_page_items(get_raw_page('public.happy', 0));
ERROR: block number 0 is out of range for relation "happy"