TableAM Table scan
TableAM提供了如下4个接口用于实现表数据的扫描功能。如果scan_begin函数的形参nkeys不为零,则扫描结果需要先根据scan keys进行过滤;pscan如果不为NULL,说明该结构体已经由parallelscan_initialize初始化过了(仅在table_beginscan_parallel函数中出现这种情况)。scan_begin函数返回的是后续函数都需要的TableScanDesc结构体,该结构体可以看成“父类”:不同的TableAM实现将TableScanDesc结构体作为其自定义结构体的第一个成员,再添加TableAM自定义的成员,从而实现“子类”。scan_getnextslot函数用于返回下一个元组,并存储在形参slot中。
/* ------------------------------------------------------------------------
 * Table scan callbacks.
 * ------------------------------------------------------------------------
 */
/*
 * Start a scan of `rel`. The callback has to return a TableScanDesc, which
 * will typically be embedded in a larger, AM-specific struct.
 *
 * If nkeys != 0, the results need to be filtered by those scan keys.
 *
 * pscan, if not NULL, will have already been initialized with
 * parallelscan_initialize(), and has to be for the same relation. It will
 * only be set when coming from table_beginscan_parallel().
 *
 * `flags` is a bitmask indicating:
 *   - the type of scan (ScanOptions's SO_TYPE_*; currently only one may be
 *     specified),
 *   - options controlling the scan's behaviour (ScanOptions's SO_ALLOW_*;
 *     several may be specified, and an AM may ignore unsupported ones),
 *   - whether the snapshot needs to be deallocated at scan_end
 *     (ScanOptions's SO_TEMP_SNAPSHOT).
 */
TableScanDesc (*scan_begin) (Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, ParallelTableScanDesc pscan, uint32 flags);
/*
 * Release resources and deallocate scan.
 *
 * If the scan holds a temporary snapshot (this comment historically named the
 * condition TableScanDesc.temp_snap; see SO_TEMP_SNAPSHOT in scan_begin's
 * `flags`), TableScanDesc.rs_snapshot needs to be unregistered.
 */
void (*scan_end) (TableScanDesc scan);
/*
 * Restart relation scan.
 *
 * If set_params is true, the allow_strat, allow_sync and allow_pagemode
 * arguments (see the SO_ALLOW_* options of scan_begin) should be taken into
 * account; otherwise they are not meant to change the scan's behaviour.
 */
void (*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode);
/*
 * Return next tuple from `scan`, store in slot.
 *
 * NOTE(review): the bool result presumably reports whether a tuple was
 * stored in `slot` -- confirm against the AM implementation.
 */
bool (*scan_getnextslot) (TableScanDesc scan, ScanDirection direction, TupleTableSlot *slot);
scan_begin
ScanOptions枚举类型用于向scan_begin函数传入控制scan行为的掩码:SO_TYPE_*用于指明扫描的类型(目前只能指定其中一个),SO_ALLOW_STRAT用于指明是否允许使用access strategy访问共享缓冲区(shared buffer),SO_ALLOW_SYNC用于指明是否向syncscan逻辑汇报位置,SO_ALLOW_PAGEMODE用于指明是否以页为单位(每次一页)检查可见性,SO_TEMP_SNAPSHOT用于指明是否在扫描结束时注销(unregister)快照。
/*
 * Bitmask values for the `flags` argument of the scan_begin callback.
 *
 * Each constant occupies its own bit, so values from the different groups
 * below can be OR-ed together.
 */
typedef enum ScanOptions
{
	/* Scan type: one of SO_TYPE_* may be specified. */
	SO_TYPE_SEQSCAN = 1 << 0,
	SO_TYPE_BITMAPSCAN = 1 << 1,
	SO_TYPE_SAMPLESCAN = 1 << 2,
	SO_TYPE_ANALYZE = 1 << 3,
	/* Note: not contiguous with the other SO_TYPE_* bits. */
	SO_TYPE_TIDSCAN = 1 << 8,

	/* Behaviour options: several of SO_ALLOW_* may be specified. */
	/* allow or disallow use of access strategy */
	SO_ALLOW_STRAT = 1 << 4,
	/* report location to syncscan logic? */
	SO_ALLOW_SYNC = 1 << 5,
	/* verify visibility page-at-a-time? */
	SO_ALLOW_PAGEMODE = 1 << 6,

	/* Snapshot handling: unregister snapshot at scan end? */
	SO_TEMP_SNAPSHOT = 1 << 7
} ScanOptions;
table_beginscan和table_beginscan_parallel用于执行SEQSCAN,返回经快照检查可见性的元组。在src/backend/executor/nodeSeqscan.c文件中的SeqNext函数会调用table_beginscan函数:如果scan为非并行,或者计划为并行但实际上顺序执行,则第一次进入该函数时需要调用table_beginscan生成并初始化TableScanDesc结构体。table_beginscan_parallel函数由执行器调用,其流程是:如果有快照被序列化到共享内存中,则将快照恢复并Register,同时增加SO_TEMP_SNAPSHOT标志(scan完成后由endscan Unregister该快照);否则使用调用者传入的SNAPSHOT_ANY(Any tuple is visible)。最后调用TableAM自定义的scan_begin接口。该函数在《PostgreSQL数据库TableAM——HeapAM Parallel table scan》一文中已经介绍过了,它用于在并行scan时替代table_beginscan函数。
/*
 * Begin a sequential scan of `rel`.
 *
 * Tuples returned by the scan pass a visibility test against `snapshot`;
 * when nkeys != 0 they are additionally filtered by the scan keys in `key`.
 *
 * The scan is started through the relation's table AM with no parallel scan
 * descriptor and with the access strategy, syncscan and page-mode options
 * all allowed.
 */
static inline TableScanDesc
table_beginscan(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key)
{
	uint32		scan_flags = SO_TYPE_SEQSCAN;

	/* Plain sequential scans allow every optional behaviour. */
	scan_flags |= SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;

	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, scan_flags);
}
table_beginscan_strat函数相对于table_beginscan多了两个形参用于控制access strategy使用和是否向syncscan逻辑汇报位置。
/*
 * Variant of table_beginscan() with an extended API: the caller decides
 * whether a nondefault buffer access strategy can be used (allow_strat) and
 * whether syncscan can be chosen (allow_sync), the latter possibly resulting
 * in the scan not starting from block zero. Plain table_beginscan() behaves
 * as if both were true.
 *
 * Page-mode visibility checking is always allowed here.
 */
static inline TableScanDesc
table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, bool allow_strat, bool allow_sync)
{
	uint32		scan_flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;

	/* Translate the caller's booleans into the matching option bits. */
	scan_flags |= allow_strat ? SO_ALLOW_STRAT : 0;
	scan_flags |= allow_sync ? SO_ALLOW_SYNC : 0;

	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, scan_flags);
}
table_beginscan_bm函数用于BITMAPSCAN,table_beginscan_bm是为位图堆扫描设置TableScanDesc的另一个入口点。尽管这种扫描技术确实与标准的seqscan非常不同,但有足够的通用性,因此值得使用相同的数据结构。
/*
 * Alternative entry point that sets up a TableScanDesc for a bitmap heap
 * scan. That scan technology is quite unlike a standard seqscan, but there
 * is just enough commonality to make sharing the data structure worthwhile.
 *
 * The scan is typed SO_TYPE_BITMAPSCAN and only page-mode visibility
 * checking is allowed; no parallel scan descriptor is passed.
 */
static inline TableScanDesc
table_beginscan_bm(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key)
{
	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL,
									   SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE);
}
table_beginscan_sampling是为TABLESAMPLE扫描设置TableScanDesc的另一个入口点。与位图扫描一样,使用相同的数据结构是值得的,尽管行为有所不同。除了table_beginscan_strat提供的选项外,此调用还允许控制是否使用页面模式可见性检查。
/*
 * Alternative entry point that sets up a TableScanDesc for a TABLESAMPLE
 * scan. As with bitmap scans, the behaviour differs from a seqscan but the
 * shared data structure is still worth using.
 *
 * Besides the allow_strat / allow_sync choices offered by
 * table_beginscan_strat(), the caller also controls whether page-mode
 * visibility checking is used (allow_pagemode).
 */
static inline TableScanDesc
table_beginscan_sampling(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, bool allow_strat, bool allow_sync, bool allow_pagemode)
{
	uint32		scan_flags = SO_TYPE_SAMPLESCAN;

	/* Every optional behaviour is opt-in for sample scans. */
	scan_flags |= allow_strat ? SO_ALLOW_STRAT : 0;
	scan_flags |= allow_sync ? SO_ALLOW_SYNC : 0;
	scan_flags |= allow_pagemode ? SO_ALLOW_PAGEMODE : 0;

	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, scan_flags);
}
table_beginscan_tid是为tid扫描设置TableScanDesc的另一个入口点。与位图扫描一样,使用相同的数据结构是值得的,尽管行为有所不同。
/*
 * Alternative entry point that sets up a TableScanDesc for a Tid scan. As
 * with bitmap scans, the behaviour differs from a seqscan but the shared
 * data structure is still worth using.
 *
 * No scan keys and no parallel scan descriptor are passed, and none of the
 * SO_ALLOW_* options is set.
 */
static inline TableScanDesc
table_beginscan_tid(Relation rel, Snapshot snapshot)
{
	return rel->rd_tableam->scan_begin(rel, snapshot,
									   0, NULL,		/* no scan keys */
									   NULL,		/* not a parallel scan */
									   SO_TYPE_TIDSCAN);
}
table_beginscan_analyze是为analyze扫描设置TableScanDesc的另一个入口点。与位图扫描一样,使用相同的数据结构是值得的,尽管行为有所不同。
/*
 * Alternative entry point that sets up a TableScanDesc for an ANALYZE scan.
 * As with bitmap scans, the behaviour differs from a seqscan but the shared
 * data structure is still worth using.
 *
 * The scan is started without a snapshot, without scan keys and without a
 * parallel scan descriptor; only SO_TYPE_ANALYZE is set in the flags.
 */
static inline TableScanDesc
table_beginscan_analyze(Relation rel)
{
	return rel->rd_tableam->scan_begin(rel,
									   NULL,		/* no snapshot */
									   0, NULL,		/* no scan keys */
									   NULL,		/* not a parallel scan */
									   SO_TYPE_ANALYZE);
}
table_beginscan_catalog是为扫描系统表而提供的函数。该函数在src/backend/bootstrap/bootstrap.c、src/backend/catalog/aclchk.c等文件中都有调用。
/*
 * Begin a sequential scan of `relation` using a catalog snapshot.
 *
 * The snapshot is obtained via GetCatalogSnapshot() for the relation's OID
 * and registered here. SO_TEMP_SNAPSHOT is included in the flags so that the
 * snapshot is unregistered at scan end. All SO_ALLOW_* options are enabled,
 * and no parallel scan descriptor is passed.
 */
TableScanDesc
table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
{
	Snapshot	catalog_snapshot;
	uint32		scan_flags;

	catalog_snapshot = RegisterSnapshot(GetCatalogSnapshot(RelationGetRelid(relation)));

	scan_flags = SO_TYPE_SEQSCAN;
	scan_flags |= SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
	scan_flags |= SO_TEMP_SNAPSHOT;

	return relation->rd_tableam->scan_begin(relation, catalog_snapshot, nkeys, key, NULL, scan_flags);
}
scan_rescan
table_rescan和table_rescan_set_params函数都用于重启表的扫描。不同之处在于table_rescan_set_params允许在开始新扫描之前更改缓冲区策略、同步扫描和页面模式选项。注意,尽管syncscan的实际使用情况可能会发生变化(实际上就是启用或禁用位置汇报),但之前选择的startblock将被保留。src/backend/executor/nodeSeqscan.c文件中的ExecReScanSeqScan函数会调用table_rescan。
/*
 * Restart a relation scan, optionally with new scan keys in `key`.
 *
 * set_params is passed as false, so the three allow_* arguments handed to
 * the AM's scan_rescan callback are placeholders.
 */
static inline void
table_rescan(TableScanDesc scan, struct ScanKeyData *key)
{
	scan->rs_rd->rd_tableam->scan_rescan(scan, key,
										 false,		/* set_params */
										 false,		/* allow_strat */
										 false,		/* allow_sync */
										 false);	/* allow_pagemode */
}
/*
 * Restart a relation scan after changing its parameters.
 *
 * The buffer strategy, syncscan and pagemode options can be changed before
 * the fresh scan starts. Although the actual use of syncscan might change
 * (effectively enabling or disabling reporting), the previously selected
 * startblock will be kept.
 */
static inline void
table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key, bool allow_strat, bool allow_sync, bool allow_pagemode)
{
	/* set_params = true tells the AM to honour the allow_* arguments. */
	scan->rs_rd->rd_tableam->scan_rescan(scan, key,
										 true,
										 allow_strat,
										 allow_sync,
										 allow_pagemode);
}
scan_getnextslot
table_scan_getnextslot函数用于扫描元组,并存放在slot中。src/backend/executor/nodeSeqscan.c文件中的SeqNext函数随后会调用table_scan_getnextslot,从表中获取下一个元组。
/*
 * Fetch the next tuple from `sscan` in the given direction and store it in
 * `slot`.
 *
 * The slot's tts_tableOid is set to the OID of the scanned relation before
 * the AM's scan_getnextslot callback runs; the callback's result is returned
 * unchanged.
 */
static inline bool
table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
	bool		result;

	/* Tag the slot with the scanned relation before handing it to the AM. */
	slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);

	result = sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);

	return result;
}
scan_end
table_endscan函数用于结束表的扫描。src/backend/executor/nodeSeqscan.c文件中的ExecEndSeqScan函数最后会调用它来执行TableScanDesc的清理工作。
/*
 * End a relation scan by invoking the scan_end callback of the table AM
 * that owns the scanned relation.
 */
static inline void
table_endscan(TableScanDesc scan)
{
	scan->rs_rd->rd_tableam->scan_end(scan);
}