本小节主要讲解查询执行模块,有机地调用存储、索引、事务、并发等模块,按照执行计划中的计划节点(操作执行接口)完成数据的读取或者修改。知识回顾:postgres源码解析37 表创建执行全流程梳理–1
关键数据结构
总图:
1 PortalData
postgres为每条SQL创建一个portal结构体存储查询计划树链表和执行策略等信息:
/*
 * PortalData: per-portal bookkeeping.  As laid out below it carries the
 * portal's identity and memory/resource ownership, the statements to run
 * together with their parameters, the chosen execution strategy, status
 * flags, executor linkage, result-tuple description, snapshots, the
 * hold store used for held cursors, and the current cursor position.
 * "Portal" is a pointer to this struct.
 */
typedef struct PortalData *Portal;
typedef struct PortalData
{
/* Bookkeeping data */
const char *name; /* portal's name */
const char *prepStmtName; /* source prepared statement (NULL if none) */
MemoryContext portalContext; /* subsidiary memory for portal */ // portal memory context
ResourceOwner resowner; /* resources owned by portal */ // resources held by the portal
void (*cleanup) (Portal portal); /* cleanup hook */ // cleanup function
/*
* State data for remembering which subtransaction(s) the portal was
* created or used in. If the portal is held over from a previous
* transaction, both subxids are InvalidSubTransactionId. Otherwise,
* createSubid is the creating subxact and activeSubid is the last subxact
* in which we ran the portal.
*/
SubTransactionId createSubid; /* the creating subxact */
SubTransactionId activeSubid; /* the last subxact with activity */
/* The query or queries the portal will execute */
const char *sourceText; /* text of query (as of 8.4, never NULL) */ // the input SQL text
CommandTag commandTag; /* command tag for original query */ // command tag
QueryCompletion qc; /* command completion data for executed query */
List *stmts; /* list of PlannedStmts */ // PlannedStmt nodes
CachedPlan *cplan; /* CachedPlan, if stmts are from one */ // cached plan
ParamListInfo portalParams; /* params to pass to query */ // portal parameter list
QueryEnvironment *queryEnv; /* environment for query */ // query environment
/* Features/options */
PortalStrategy strategy; /* see above */ // execution strategy
int cursorOptions; /* DECLARE CURSOR option bits */ // cursor option bits
bool run_once; /* portal will only be run once */ // whether the portal runs only once
/* Status data */
PortalStatus status; /* see above */ // portal status
bool portalPinned; /* a pinned portal can't be dropped */ // whether the portal is pinned
bool autoHeld; /* was automatically converted from pinned to
* held (see HoldPinnedPortals()) */
/* If not NULL, Executor is active; call ExecutorEnd eventually: */
QueryDesc *queryDesc; /* info needed for executor invocation */ // query descriptor
/* If portal returns tuples, this is their tupdesc: */
TupleDesc tupDesc; /* descriptor for result tuples */ // tuple descriptor
/* and these are the format codes to use for the columns: */
int16 *formats; /* a format code for each column */ // per-column format codes
/*
* Outermost ActiveSnapshot for execution of the portal's queries. For
* all but a few utility commands, we require such a snapshot to exist.
* This ensures that TOAST references in query results can be detoasted,
* and helps to reduce thrashing of the process's exposed xmin.
*/
Snapshot portalSnapshot; /* active snapshot, or NULL if none */ // snapshot
/*
* Where we store tuples for a held cursor or a PORTAL_ONE_RETURNING or
* PORTAL_UTIL_SELECT query. (A cursor held past the end of its
* transaction no longer has any active executor state.)
*/
Tuplestorestate *holdStore; /* store for holdable cursors */
MemoryContext holdContext; /* memory containing holdStore */
/*
* Snapshot under which tuples in the holdStore were read. We must keep a
* reference to this snapshot if there is any possibility that the tuples
* contain TOAST references, because releasing the snapshot could allow
* recently-dead rows to be vacuumed away, along with any toast data
* belonging to them. In the case of a held cursor, we avoid needing to
* keep such a snapshot by forcibly detoasting the data.
*/
Snapshot holdSnapshot; /* registered snapshot, or NULL if none */
/*
* atStart, atEnd and portalPos indicate the current cursor position.
* portalPos is zero before the first row, N after fetching N'th row of
* query. After we run off the end, portalPos = # of rows in query, and
* atEnd is true. Note that atStart implies portalPos == 0, but not the
* reverse: we might have backed up only as far as the first row, not to
* the start. Also note that various code inspects atStart and atEnd, but
* only the portal movement routines should touch portalPos.
*/
bool atStart;
bool atEnd;
uint64 portalPos;
/* Presentation data, primarily used by the pg_cursors system view */
TimestampTz creation_time; /* time at which this portal was defined */
bool visible; /* include this portal in pg_cursors? */
} PortalData;
根据SQL语句的类型,pg提供5种执行策略:
1)PORTAL_ONE_SELECT:针对单一SELECT语句,支持游标操作
2)PORTAL_ONE_RETURNING:针对含有RETURNING子句的单一 INSERT/DELETE/UPDATE语句。查询执行时,将首先处理所有满足条件的元组,在执行过程中缓存,最后返回给客户端
3)PORTAL_ONE_MOD_WITH:针对含有data-modifying CTEs 的单一SELECT语句查询
4)PORTAL_UTIL_SELECT: 针对功能性语句但返回结果类似SELECT语句,如EXPLAIN/SHOW
5)PORTAL_MULTI_QUERY: 除上述4种类型之外的语句,这个策略具有一般性,能够处理一个或者多个操作
/*
 * PortalStrategy: the execution strategy chosen for a portal, selected
 * according to the kind of SQL statement being run.
 */
typedef enum PortalStrategy
{
PORTAL_ONE_SELECT, /* a single SELECT statement; supports cursor operations */
PORTAL_ONE_RETURNING, /* a single INSERT/DELETE/UPDATE with a RETURNING
* clause; result tuples are buffered during
* execution and returned to the client at the end */
PORTAL_ONE_MOD_WITH, /* a single SELECT containing data-modifying CTEs */
PORTAL_UTIL_SELECT, /* a utility statement whose result looks like a
* SELECT's, e.g. EXPLAIN / SHOW */
PORTAL_MULTI_QUERY /* everything else; the general strategy, able to
* handle one or more operations */
} PortalStrategy;
PlannedStmt 计划树的头节点为PlannedStmt数据结构,其保存执行器所需的一些信息:语句类型(commandType)、查询计划树根节点(planTree)、查询涉及的范围表(rtable)和结果关系表(resultRelations)
/*
 * PlannedStmt: head node of a plan.  It records the command type, the
 * root of the Plan tree (planTree), the range table, the result-relation
 * indexes, and the other per-statement data listed below.
 */
typedef struct PlannedStmt
{
NodeTag type;
CmdType commandType; /* select|insert|update|delete|utility */
uint64 queryId; /* query identifier (copied from Query) */
bool hasReturning; /* is it insert|update|delete RETURNING? */
bool hasModifyingCTE; /* has insert|update|delete in WITH? */
bool canSetTag; /* do I set the command result tag? */
bool transientPlan; /* redo plan when TransactionXmin changes? */
bool dependsOnRole; /* is plan specific to current role? */
bool parallelModeNeeded; /* parallel mode required to execute? */ // parallel mode
int jitFlags; /* which forms of JIT should be performed */
struct Plan *planTree; /* tree of Plan nodes */
List *rtable; /* list of RangeTblEntry nodes */ // range table
/* rtable indexes of target relations for INSERT/UPDATE/DELETE */
List *resultRelations; /* integer list of RT indexes, or NIL */
List *appendRelations; /* list of AppendRelInfo nodes */
List *subplans; /* Plan trees for SubPlan expressions; note
* that some could be NULL */
Bitmapset *rewindPlanIDs; /* indices of subplans that require REWIND */
List *rowMarks; /* a list of PlanRowMark's */
List *relationOids; /* OIDs of relations the plan depends on */ // relation OIDs
List *invalItems; /* other dependencies, as PlanInvalItems */
List *paramExecTypes; /* type OIDs for PARAM_EXEC Params */
Node *utilityStmt; /* non-null if this is utility stmt */ // utility statement
/* statement location in source string (copied from Query) */
int stmt_location; /* start location, or -1 if unknown */
int stmt_len; /* length in bytes; 0 means "rest of string" */
} PlannedStmt;
Plan的子类节点通过lefttree和righttree字段组织成整棵查询计划树,其根节点指针被保存在PlannedStmt结构体中。Plan结构体包含代价等信息(执行计划是基于CBO生成的)、投影以及位图参数等信息
/*
 * Plan: fields common to plan-tree nodes.  Holds the cost and row
 * estimates, parallel/async flags, the target list and quals, the
 * lefttree/righttree child links, initPlans, and the parameter sets
 * used to manage rescanning.
 */
typedef struct Plan
{
NodeTag type;
/*
* estimated execution costs for plan (see costsize.c for more info)
*/
Cost startup_cost; /* cost expended before fetching any tuples */
Cost total_cost; /* total cost (assuming all tuples fetched) */
/*
* planner's estimate of result size of this plan step
*/
double plan_rows; /* number of rows plan is expected to emit */
int plan_width; /* average row width in bytes */
/*
* information needed for parallel query
*/
bool parallel_aware; /* engage parallel-aware logic? */
bool parallel_safe; /* OK to use as part of parallel plan? */
/*
* information needed for asynchronous execution
*/
bool async_capable; /* engage asynchronous-capable logic? */
/*
* Common structural data for all Plan types.
*/
int plan_node_id; /* unique across entire final plan tree */
List *targetlist; /* target list to be computed at this node */
List *qual; /* implicitly-ANDed qual conditions */
struct Plan *lefttree; /* input plan tree(s) */
struct Plan *righttree;
List *initPlan; /* Init Plan nodes (un-correlated expr
* subselects) */
/*
* Information for management of parameter-change-driven rescanning
*
* extParam includes the paramIDs of all external PARAM_EXEC params
* affecting this plan node or its children. setParam params from the
* node's initPlans are not included, but their extParams are.
*
* allParam includes all the extParam paramIDs, plus the IDs of local
* params that affect the node (i.e., the setParams of its initplans).
* These are _all_ the PARAM_EXEC params that affect this node.
*/
Bitmapset *extParam;
Bitmapset *allParam;
} Plan;
Plan节点在执行过程中会生成对应的PlanState结构指针,计划节点的PlanState结构也会按照查询计划树的结构组织成计划节点执行状态树。
/*
 * PlanState: execution-time state associated with one Plan node (see
 * "plan").  The lefttree/righttree links parallel those of the plan
 * tree, so the states form a tree of the same shape.  Also carries the
 * per-node tuple-fetch callbacks, instrumentation, result slot and
 * projection data, and the slot-type hints described further down.
 */
typedef struct PlanState
{
NodeTag type;
Plan *plan; /* associated Plan node */
EState *state; /* at execution time, states of individual
* nodes point to one EState for the whole
* top-level plan */
ExecProcNodeMtd ExecProcNode; /* function to return next tuple */ // function returning the next tuple
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper (i.e. the real execution function) */
Instrumentation *instrument; /* Optional runtime stats for this node */
WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
/* Per-worker JIT instrumentation */
struct SharedJitInstrumentation *worker_jit_instrument;
/*
* Common structural data for all Plan types. These links to subsidiary
* state trees parallel links in the associated plan tree (except for the
* subPlan list, which does not exist in the plan tree).
*/
ExprState *qual; /* boolean qual condition */
struct PlanState *lefttree; /* input plan tree(s) */
struct PlanState *righttree;
List *initPlan; /* Init SubPlanState nodes (un-correlated expr
* subselects) */
List *subPlan; /* SubPlanState nodes in my expressions */
/*
* State for management of parameter-change-driven rescanning
*/
Bitmapset *chgParam; /* set of IDs of changed Params */
/*
* Other run-time state needed by most if not all node types.
*/
TupleDesc ps_ResultTupleDesc; /* node's return type */ // tuple descriptor
TupleTableSlot *ps_ResultTupleSlot; /* slot for my result tuples */ // slot holding result tuples
ExprContext *ps_ExprContext; /* node's expression-evaluation context */ // node's expression context
ProjectionInfo *ps_ProjInfo; /* info for doing tuple projection */ // projection info
bool async_capable; /* true if node is async-capable */
/*
* Scanslot's descriptor if known. This is a bit of a hack, but otherwise
* it's hard for expression compilation to optimize based on the
* descriptor, without encoding knowledge about all executor nodes.
*/
TupleDesc scandesc;
/*
* Define the slot types for inner, outer and scanslots for expression
* contexts with this state as a parent. If *opsset is set, then
* *opsfixed indicates whether *ops is guaranteed to be the type of slot
* used. That means that every slot in the corresponding
* ExprContext.ecxt_*tuple will point to a slot of that type, while
* evaluating the expression. If *opsfixed is false, but *ops is set,
* that indicates the most likely type of slot.
*
* The scan* fields are set by ExecInitScanTupleSlot(). If that's not
* called, nodes can initialize the fields themselves.
*
* If outer/inneropsset is false, the information is inferred on-demand
* using ExecGetResultSlotOps() on ->righttree/lefttree, using the
* corresponding node's resultops* fields.
*
* The result* fields are automatically set when ExecInitResultSlot is
* used (be it directly or when the slot is created by
* ExecAssignScanProjectionInfo() /
* ExecConditionalAssignProjectionInfo()). If no projection is necessary
* ExecConditionalAssignProjectionInfo() defaults those fields to the scan
* operations.
*/
const TupleTableSlotOps *scanops;
const TupleTableSlotOps *outerops;
const TupleTableSlotOps *innerops;
const TupleTableSlotOps *resultops;
bool scanopsfixed;
bool outeropsfixed;
bool inneropsfixed;
bool resultopsfixed;
bool scanopsset;
bool outeropsset;
bool inneropsset;
bool resultopsset;
} PlanState;
TupleDescData结构体记录了元组的属性,引用计数和约束等信息,用于元组各属性的扫描
/*
 * TupleDescData: description of a tuple's layout -- the attribute count,
 * composite type id and typmod, a reference count, optional constraints,
 * and a trailing flexible array with one pg_attribute entry per attribute.
 */
typedef struct TupleDescData
{
int natts; /* number of attributes in the tuple */
Oid tdtypeid; /* composite type ID for tuple type */
int32 tdtypmod; /* typmod for tuple type */
int tdrefcount; /* reference count, or -1 if not counting */
TupleConstr *constr; /* constraints, or NULL if none */
/* attrs[N] is the description of Attribute Number N+1 */
FormData_pg_attribute attrs[FLEXIBLE_ARRAY_MEMBER];
} TupleDescData;
执行器在执行过程中会构造全局执行状态信息EState,该结构体记录了查询涉及的范围表、上下文、用于在节点之间传递元组的全局元组表和元组上下文(每次使用后便会释放)
/*
 * EState: working state for executing a query.  Groups the scan
 * direction and snapshots, the range table with its per-entry relation
 * and row-mark arrays, result-relation bookkeeping, parameter values,
 * the per-query memory context, the tuple table, and the parallel-query
 * and JIT data described below.
 */
typedef struct EState
{
NodeTag type;
/* Basic state for all query types: */
ScanDirection es_direction; /* current scan direction */
Snapshot es_snapshot; /* time qual to use */
Snapshot es_crosscheck_snapshot; /* crosscheck time qual for RI */
List *es_range_table; /* List of RangeTblEntry */
Index es_range_table_size; /* size of the range table arrays */
Relation *es_relations; /* Array of per-range-table-entry Relation
* pointers, or NULL if not yet opened */
struct ExecRowMark **es_rowmarks; /* Array of per-range-table-entry
* ExecRowMarks, or NULL if none */
PlannedStmt *es_plannedstmt; /* link to top of plan tree */
const char *es_sourceText; /* Source text from QueryDesc */
JunkFilter *es_junkFilter; /* top-level junk filter, if any */
/* If query can insert/delete tuples, the command ID to mark them with */
CommandId es_output_cid;
/* Info about target table(s) for insert/update/delete queries: */
ResultRelInfo **es_result_relations; /* Array of per-range-table-entry
* ResultRelInfo pointers, or NULL
* if not a target table */
List *es_opened_result_relations; /* List of non-NULL entries in
* es_result_relations in no
* specific order */
PartitionDirectory es_partition_directory; /* for PartitionDesc lookup */
/*
* The following list contains ResultRelInfos created by the tuple routing
* code for partitions that aren't found in the es_result_relations array.
*/
List *es_tuple_routing_result_relations;
/* Stuff used for firing triggers: */
List *es_trig_target_relations; /* trigger-only ResultRelInfos */
/* Parameter info: */
ParamListInfo es_param_list_info; /* values of external params */
ParamExecData *es_param_exec_vals; /* values of internal params */
QueryEnvironment *es_queryEnv; /* query environment */
/* Other working state: */
MemoryContext es_query_cxt; /* per-query context in which EState lives */
List *es_tupleTable; /* List of TupleTableSlots */
uint64 es_processed; /* # of tuples processed */
int es_top_eflags; /* eflags passed to ExecutorStart */
int es_instrument; /* OR of InstrumentOption flags */
bool es_finished; /* true when ExecutorFinish is done */
List *es_exprcontexts; /* List of ExprContexts within EState */
List *es_subplanstates; /* List of PlanState for SubPlans */
List *es_auxmodifytables; /* List of secondary ModifyTableStates */
/*
* this ExprContext is for per-output-tuple operations, such as constraint
* checks and index-value computations. It will be reset for each output
* tuple. Note that it will be created only if needed.
*/
ExprContext *es_per_tuple_exprcontext;
/*
* If not NULL, this is an EPQState's EState. This is a field in EState
* both to allow EvalPlanQual aware executor nodes to detect that they
* need to perform EPQ related work, and to provide necessary information
* to do so.
*/
struct EPQState *es_epq_active;
bool es_use_parallel_mode; /* can we use parallel workers? */
/* The per-query shared memory area to use for parallel execution. */
struct dsa_area *es_query_dsa;
/*
* JIT information. es_jit_flags indicates whether JIT should be performed
* and with which options. es_jit is created on-demand when JITing is
* performed.
*
* es_jit_worker_instr is the combined, on demand allocated,
* instrumentation from all workers. The leader's instrumentation is kept
* separate, and is combined on demand by ExplainPrintJITSummary().
*/
int es_jit_flags;
struct JitContext *es_jit;
struct JitInstrumentation *es_jit_worker_instr;
} EState;