阅读顺序
《Postgresql源码(30)Postgresql索引基础B-linked-tree》
《Postgresql源码(31)Btree索引相关系统表和整体结构》
《Postgresql源码(32)Btree索引分裂前后结构差异对比》
《Postgresql源码(33)Btree索引读——整体流程&_bt_first》
《Postgresql源码(34)Btree索引读——_bt_first搜索部分分析》
《Postgresql源码(36)Btree索引读——_bt_next搜索部分分析》
总结
- BTScanPosData会在so->currPos->items缓存当前查询页面的ctid。每次查询完一个页面,会使用_bt_steppage更新currPos的内容。
- 叶子页面加锁特点:_bt_steppage会使用
_bt_readnextpage
打开下一个页面,加上锁之后会开始检查数据,把符合要求的数据的heap tid记录到so->currPos->items中。同一时间只会持一把锁。 - branch页面加锁特点:btree在爬非叶子节点的时候,从root到leaf的过程是用_bt_relandgetbuf加锁的,也就是先放一个老的锁,在加下一个页面的锁。同一时间只会持一把锁。(
_bt_search
)
页面结构和预期
继续分析34篇提到的这条SQL
用于分析的SQL预期:_bt_next会扫过1、2、4三个leaf页面
代码语言:javascript复制-- 起始 > (1,19) 终止 < (3,59)
-- PAGE1:扫描1条:itemoffset 141-141
-- PAGE2:扫描140条:itemoffset 2-141
-- PAGE4:扫描139条:itemoffset 2-140
select * from t81 where id>139 and id<419;
数据构建
代码语言:javascript复制create table t81(id int, info text);
create index idx_t81_id_info on t81(id,info);
insert into t81 select generate_series(1,149370), md5(random()::text);
select * from bt_metap('idx_t81_id_info');
magic | version | root | level | fastroot | fastlevel
-------- --------- ------ ------- ---------- -----------
340322 | 2 | 116 | 2 | 116 | 2
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 116);
itemoffset | ctid
------------ ----------
1 | (3,1)
2 | (115,1)
3 | (227,1)
4 | (338,1)
5 | (449,1)
6 | (560,1)
7 | (671,1)
8 | (782,1)
9 | (893,1)
10 | (1004,1)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 1);
itemoffset | ctid
------------ ---------
1 | (1,21)
2 | (0,1)
3 | (0,2)
4 | (0,3)
5 | (0,4)
...
140 | (1,19) <------------------------ start
141 | (1,20)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 2);
itemoffset | ctid
------------ ---------
1 | (2,41)
2 | (1,21)
3 | (1,22)
4 | (1,23)
5 | (1,24)
6 | (1,25)
...
140 | (2,39)
141 | (2,40)
select itemoffset,ctid from bt_page_items('idx_t81_id_info', 4);
itemoffset | ctid
------------ ---------
1 | (3,61)
2 | (2,41)
3 | (2,42)
4 | (2,43)
5 | (2,44)
6 | (2,45)
7 | (2,46)
...
140 | (3,59) <------------------------ end
141 | (3,60)
select * from t81 where ctid='(1,19)';
id | info
----- ----------------------------------
143 | 2c33d634b63de9c16306ac81abd2dcf2
select * from t81 where ctid='(3,59)';
id | info
----- ----------------------------------
423 | d13e311ad061155067e44af2c655d5b2
-- 起始 > (1,19) 终止 < (3,59)
-- PAGE1:扫描1条:itemoffset 141-141
-- PAGE2:扫描140条:itemoffset 2-141
-- PAGE4:扫描139条:itemoffset 2-140
select * from t81 where id>139 and id<419;
前面《Postgresql源码(33)Btree索引读——整体流程&_bt_first》
已经对定位初始页做了分析,这里已经拿到了初始ctid,继续分析后面的搜索流程。
重点关注scan过程用到的数据结构。
_bt_next
b PortalRun b _bt_next
在_bt_first执行后,BTScanOpaque的数据已经完整,这里缓存了查询下一条所需要的全部数据。
函数流程比较简单:
代码语言:javascript复制bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
BTScanPosItem *currItem;
/*
* Advance to next tuple on current page; or if there's no more, try to
* step to the next page with data.
*/
if (ScanDirectionIsForward(dir))
{
if ( so->currPos.itemIndex > so->currPos.lastItem)
{
if (!_bt_steppage(scan, dir))
return false;
}
}
else
{
if (--so->currPos.itemIndex < so->currPos.firstItem)
{
if (!_bt_steppage(scan, dir))
return false;
}
}
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples currItem->tupleOffset);
return true;
}
这里重点看下BTScanOpaque用到的currPos数据结构和当前内容。
代码语言:javascript复制typedef struct BTScanOpaqueData
{
...
BTScanPosData currPos; /* current position data */
...
} BTScanOpaqueData;
BTScanPosData
代码语言:javascript复制typedef struct BTScanPosData
{
Buffer buf; /* if valid, the buffer is pinned */
XLogRecPtr lsn; /* pos in the WAL stream when page was read */
BlockNumber currPage; /* page referenced by items array */
BlockNumber nextPage; /* page's right link when we scanned it */
/*
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
* We can clear the appropriate one of these flags when _bt_checkkeys()
* returns continuescan = false.
*/
bool moreLeft;
bool moreRight;
/*
* If we are doing an index-only scan, nextTupleOffset is the first free
* location in the associated tuple storage workspace.
*/
int nextTupleOffset;
/*
* The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the
* array back-to-front, so we start at the last slot and fill downwards.
* Hence we need both a first-valid-entry and a last-valid-entry counter.
* itemIndex is a cursor showing which entry was last returned to caller.
*/
int firstItem; /* first valid index in items[] */
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;
typedef struct BTScanPosItem /* what we remember about each match */
{
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
} BTScanPosItem;
执行SQL后,第一次_bt_next
代码语言:javascript复制-- 起始 > (1,19) 终止 < (3,59)
-- PAGE1:扫描1条:itemoffset 141-141
-- PAGE2:扫描140条:itemoffset 2-141
-- PAGE4:扫描139条:itemoffset 2-140
select * from t81 where id>139 and id<419;
这里应该扫描PAGE1的最后一条:
代码语言:javascript复制(gdb) p so->currPos
{
buf = 150,
lsn = 128202491872,
currPage = 1,
nextPage = 2,
moreLeft = 0 '