首先看dynamic_shared_memory_type这个GUC参数,它用于指定dynamic shared memory implementation的类型(DSM_IMPL_POSIX、DSM_IMPL_SYSV、DSM_IMPL_WINDOWS、DSM_IMPL_MMAP,定义在src/include/storage/dsm_impl.h文件中)。再了解一下共享内存的四种操作:CREATE(Create a segment whose size is the request_size and map it)、ATTACH(Map the segment, whose size must be the request_size)、DETACH(Unmap the segment)、DESTROY(Unmap the segment, if it is mapped. Destroy the segment)。PostgreSQL将其定义为dsm_op枚举类型,并使用dsm_impl_op函数封装不同的操作和不同的平台API(定义在src/backend/storage/ipc/dsm_impl.c文件中)。dsm_impl_op各形参的含义如下:形参handle的类型为dsm_handle(typedef uint32 dsm_handle),用于代表已经申请了的共享内存的handle,或创建共享内存时传入的handle;形参request_size对于DSM_OP_CREATE来说,代表需要创建的共享内存的大小,否则为0;形参impl_private作为传入共享内存api的私有数据(Will be a pointer to NULL for the first operation on a shared memory segment within this backend; thereafter, it will point to the value to which it was set on the previous call);形参mapped_address代表需要返回的当前映射共享内存的起始地址(Pointer to start of current mapping; pointer to NULL if none. Updated with new mapping address);形参mapped_size代表需要返回的当前映射共享内存的大小(Pointer to size of current mapping; pointer to 0 if none. Updated with new mapped size)。
/*
 * A "name" for a dynamic shared memory segment.
 *
 * This is the value handed to dsm_impl_op() to identify which segment an
 * operation applies to.  The callers visible in this file choose it with
 * random() and skip DSM_HANDLE_INVALID, which is reserved as a sentinel.
 */
typedef uint32 dsm_handle;
/*
 * All the shared-memory operations we know about.
 *
 *   DSM_OP_CREATE  - create a segment whose size is the request_size and
 *                    map it
 *   DSM_OP_ATTACH  - map an existing segment
 *   DSM_OP_DETACH  - unmap the segment
 *   DSM_OP_DESTROY - unmap the segment if it is mapped, then destroy it
 */
typedef enum {
DSM_OP_CREATE, DSM_OP_ATTACH, DSM_OP_DETACH, DSM_OP_DESTROY
} dsm_op;
/*
 * Perform one shared-memory operation using whichever platform
 * implementation dynamic_shared_memory_type currently selects.
 *
 * op              - which operation to perform (see dsm_op)
 * handle          - identifies the segment to operate on
 * request_size    - size of the segment for DSM_OP_CREATE; the callers in
 *                   this file pass 0 for every other operation
 * impl_private    - in/out: implementation-private data for this segment
 * mapped_address  - in/out: start of the current mapping, NULL if none
 * mapped_size     - in/out: size of the current mapping, 0 if none
 * elevel          - error level forwarded to the implementation
 *
 * Returns whatever the selected implementation returns; the callers in this
 * file treat true as success.  Only implementations compiled into this build
 * (USE_DSM_*) can be selected; any other value is reported via elog(ERROR).
 */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
            void **impl_private, void **mapped_address, Size *mapped_size,
            int elevel)
{
    switch (dynamic_shared_memory_type)
    {
#ifdef USE_DSM_POSIX
        case DSM_IMPL_POSIX:
            return dsm_impl_posix(op, handle, request_size, impl_private,
                                  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
        case DSM_IMPL_SYSV:
            return dsm_impl_sysv(op, handle, request_size, impl_private,
                                 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
        case DSM_IMPL_WINDOWS:
            return dsm_impl_windows(op, handle, request_size, impl_private,
                                    mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
        case DSM_IMPL_MMAP:
            return dsm_impl_mmap(op, handle, request_size, impl_private,
                                 mapped_address, mapped_size, elevel);
#endif
        default:
            elog(ERROR, "unexpected dynamic shared memory type: %d",
                 dynamic_shared_memory_type);
            break;
    }

    /* Reached only via the default case, should elog() ever return. */
    return false;
}
初始化dsm control header
postmaster守护进程在CreateSharedMemoryAndSemaphores函数的最后会调用void dsm_postmaster_startup(PGShmemHeader *shim)函数来初始化dynamic shared memory system。其中最重要的就是在for无限循环中调用dsm_impl_op函数,以random生成的数作为handle,从机器的共享内存中申请控制段所需的共享内存,直到创建成功为止。整个流程创建的结构体如上图所示。
/*
 * Create and initialise the dynamic shared memory control segment.
 *
 * shim - main shared memory header; the handle of the new control segment
 *        is recorded in shim->dsm_control.
 *
 * On return the file-level variable dsm_control points at the freshly
 * mapped control segment, which has been stamped with the magic number and
 * sized for maxitems slots, none of them in use.  dsm_postmaster_shutdown
 * is registered with on_shmem_exit.
 */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
    void       *control_address = NULL;
    uint32      maxitems;
    Size        segsize;

    /*
     * If we're using the mmap implementation, clean up any leftovers.
     * Cleanup isn't needed on Windows, and happens earlier in startup for
     * POSIX and System V shared memory, via a direct call to
     * dsm_cleanup_using_control_segment.
     */
    if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
        dsm_cleanup_for_mmap();

    /* Work out how large the new control segment has to be. */
    maxitems = PG_DYNSHMEM_FIXED_SLOTS
        + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
    segsize = dsm_control_bytes_needed(maxitems);

    /*
     * Keep drawing random identifiers until one can be used to create the
     * control segment.  DSM_HANDLE_INVALID is sometimes used as a sentinel
     * meaning "no control segment is known to exist", so never hand that
     * value out for a real control segment.
     */
    for (;;)
    {
        dsm_control_handle = random();
        if (dsm_control_handle != DSM_HANDLE_INVALID &&
            dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
                        &dsm_control_impl_private, &control_address,
                        &dsm_control_mapped_size, ERROR))
            break;
    }

    dsm_control = control_address;
    on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
    shim->dsm_control = dsm_control_handle;

    /* Initialise the control segment: valid magic, no slots in use. */
    dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
    dsm_control->nitems = 0;
    dsm_control->maxitems = maxitems;
}
dsm segment
dsm_create_descriptor函数用于创建dsm_segment结构体,其主要的工作就是创建dsm_segment,并初始化其成员;将其与CurrentResourceOwner进行关联;将dsm_segment加入dsm_segment_list双向链表中。
/*
 * Allocate a backend-local dsm_segment descriptor and register it.
 *
 * The descriptor is allocated in TopMemoryContext, linked at the head of
 * dsm_segment_list, and starts with no control slot, no mapping and an
 * empty on_detach callback list.  If there is a CurrentResourceOwner the
 * descriptor is recorded with it; otherwise resowner is left NULL.
 *
 * The caller is responsible for setting the descriptor's handle.
 */
static dsm_segment *
dsm_create_descriptor(void)
{
    dsm_segment *desc;

    /* Make room in the owner's bookkeeping before we allocate anything. */
    if (CurrentResourceOwner != NULL)
        ResourceOwnerEnlargeDSMs(CurrentResourceOwner);

    desc = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));

    /* desc->handle must be initialized by the caller */
    desc->control_slot = INVALID_CONTROL_SLOT;
    desc->impl_private = NULL;
    desc->mapped_address = NULL;
    desc->mapped_size = 0;

    dlist_push_head(&dsm_segment_list, &desc->node);

    desc->resowner = CurrentResourceOwner;
    if (CurrentResourceOwner != NULL)
        ResourceOwnerRememberDSM(CurrentResourceOwner, desc);

    slist_init(&desc->on_detach);

    return desc;
}
从上述执行逻辑可以看出:如果CurrentResourceOwner为非NULL,新建的segment将与该ResourceOwner关联,在ResourceOwner释放前需要提前detach该segment,否则会报出warning;如果CurrentResourceOwner为NULL,新建的segment会一直保持attach状态,直到显式detach或session结束。dsm_pin_mapping函数用于解除segment与ResourceOwner的关联,这样segment会一直保持attach状态,直到显式detach或session结束,也就是说dsm_pin_mapping可用于在进程的整个生命周期中保留映射。dsm_unpin_mapping函数则反转该决定,使该段重新归当前资源所有者所有;在即将执行某些会使该段无法再被本后端使用的操作之前,这样做可能会很有用。
/*
 * Keep a dynamic shared memory mapping until end of session.
 *
 * By default, mappings are owned by the current resource owner, which
 * typically means they stick around for the duration of the current query
 * only.  This detaches the segment from its owner (if it has one) so that
 * releasing the owner no longer affects the mapping.  Calling it on a
 * segment that has no owner does nothing.
 */
void
dsm_pin_mapping(dsm_segment *seg)
{
    /* Already ownerless: nothing to forget. */
    if (seg->resowner == NULL)
        return;

    ResourceOwnerForgetDSM(seg->resowner, seg);
    seg->resowner = NULL;
}
/*
 * Arrange to remove a dynamic shared memory mapping at cleanup time.
 *
 * dsm_pin_mapping() can be used to preserve a mapping for the entire
 * lifetime of a process; this function reverses that decision, making the
 * segment owned by the current resource owner.  This may be useful just
 * before performing some operation that will invalidate the segment for
 * future use by this backend.
 *
 * NOTE(review): nothing here checks that seg->resowner is currently NULL or
 * that CurrentResourceOwner is non-NULL -- confirm that callers guarantee
 * both.
 */
void dsm_unpin_mapping(dsm_segment *seg) {
/* Presumably reserves bookkeeping space so the Remember call below cannot fail -- TODO confirm. */
ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
/* Hand the segment to the current owner, then record it with that owner. */
seg->resowner = CurrentResourceOwner;
ResourceOwnerRememberDSM(seg->resowner, seg);
}
创建dsm segment
dsm_create函数用于创建新的动态共享内存段。如果CurrentResourceOwner为非NULL,新建的segment将与该ResourceOwner关联,在ResourceOwner释放前需要提前detach该segment,否则会报出warning;如果CurrentResourceOwner为NULL,新建的segment会一直保持attach状态,直到显式detach或session结束。其执行流程:首先创建segment descriptor,然后为该segment descriptor申请共享内存;接着遍历dsm_control_header.item寻找空闲的槽(refcnt为0),创建其handle到dsm_segment申请共享内存的关联(dsm_control->item[i].handle = seg->handle),将refcnt设置为2,pinned设置为false,并建立dsm_segment与dsm_control_header.item槽的关联(seg->control_slot = i);如果已分配的槽中没有空闲槽,且槽数量尚未达到maxitems,则使用一个新槽并递增nitems;如果槽数量已达到maxitems,则需要回滚之前的操作(销毁共享内存并释放dsm_segment),然后根据flags返回NULL或报错。
/*
 * Create a new dynamic shared memory segment and register it in the
 * control segment.
 *
 * size  - requested size of the segment
 * flags - if DSM_CREATE_NULL_IF_MAXSEGMENTS is set, running out of control
 *         slots yields a NULL return instead of an error
 *
 * Returns the new segment descriptor, or NULL in the case described above.
 * When the slot table is full and the flag is not set, an
 * ERRCODE_INSUFFICIENT_RESOURCES error is reported.
 */
dsm_segment *
dsm_create(Size size, int flags)
{
    dsm_segment *seg;
    uint32      nitems;
    uint32      slot;

    if (!dsm_init_done)
        dsm_backend_startup();

    /* Create a new segment descriptor. */
    seg = dsm_create_descriptor();

    /*
     * Keep drawing random identifiers until one is accepted.  The sentinel
     * DSM_HANDLE_INVALID is reserved and never used for a real segment.
     */
    for (;;)
    {
        seg->handle = random();
        if (seg->handle != DSM_HANDLE_INVALID &&
            dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
                        &seg->mapped_address, &seg->mapped_size, ERROR))
            break;
    }

    /* Lock the control segment so we can register the new segment. */
    LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

    /*
     * Pick a slot: the first already-allocated slot that is unused
     * (refcnt == 0), or else the slot just past the allocated ones.
     */
    nitems = dsm_control->nitems;
    for (slot = 0; slot < nitems; ++slot)
    {
        if (dsm_control->item[slot].refcnt == 0)
            break;
    }

    /*
     * No reusable slot and no room to grow: undo everything done so far
     * (destroy the segment, drop the descriptor) and give up.
     */
    if (slot == nitems && nitems >= dsm_control->maxitems)
    {
        LWLockRelease(DynamicSharedMemoryControlLock);
        dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
                    &seg->mapped_address, &seg->mapped_size, WARNING);
        if (seg->resowner != NULL)
            ResourceOwnerForgetDSM(seg->resowner, seg);
        dlist_delete(&seg->node);
        pfree(seg);

        if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
            return NULL;
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("too many dynamic shared memory segments")));
    }

    /* Enter the handle into the chosen slot. */
    dsm_control->item[slot].handle = seg->handle;
    /* refcnt of 1 triggers destruction, so start at 2 */
    dsm_control->item[slot].refcnt = 2;
    dsm_control->item[slot].impl_private_pm_handle = NULL;
    dsm_control->item[slot].pinned = false;
    seg->control_slot = slot;

    /* A brand-new slot extends the allocated part of the table. */
    if (slot == nitems)
        dsm_control->nitems++;

    LWLockRelease(DynamicSharedMemoryControlLock);
    return seg;
}
共享dsm segment attach & detach
dsm_attach函数用于attach dynamic shared memory segment,其主要工作是新建dsm_segment,并关联已经申请的共享内存(也就是说多个dsm_segment可以映射同一个共享内存块,dsm_control_item.refcnt记录该共享内存被引用的数量),输入参数为dsm_handle。其执行流程如下:首先检查该dsm_handle是否已经被当前后端中已存在的dsm_segment持有,若是则报错;然后创建dsm_segment结构体,遍历dsm_control->item槽,在dsm_control->item[i].refcnt > 1且dsm_control->item[i].handle == seg->handle时使用该槽并递增其refcnt;如果没有找到匹配的槽,则释放dsm_segment并返回NULL;最后关联dsm_segment和共享内存。
/*
 * Attach to an existing dynamic shared memory segment.
 *
 * h - handle of the segment to attach; unlike dsm_create, which generates
 *     its handle with random(), the handle here is supplied by the caller.
 *
 * Returns a new descriptor for the mapped segment, or NULL if no live slot
 * in the control segment carries this handle.  Attaching a handle this
 * backend already has a descriptor for is reported with elog(ERROR).
 */
dsm_segment *
dsm_attach(dsm_handle h)
{
    dsm_segment *seg;
    dlist_iter  iter;
    uint32      nitems;
    uint32      slot;

    if (!dsm_init_done)
        dsm_backend_startup();

    /*
     * If you're hitting this error, you probably want to attempt to find an
     * existing mapping via dsm_find_mapping() before calling dsm_attach()
     * to create a new one.
     */
    dlist_foreach(iter, &dsm_segment_list)
    {
        seg = dlist_container(dsm_segment, node, iter.cur);
        if (seg->handle == h)
            elog(ERROR, "can't attach the same segment more than once");
    }

    /* Create a new segment descriptor. */
    seg = dsm_create_descriptor();
    seg->handle = h;

    /* Bump reference count for this segment in shared memory. */
    LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
    nitems = dsm_control->nitems;
    for (slot = 0; slot < nitems; ++slot)
    {
        /*
         * A slot qualifies only if it is live and carries our handle.  A
         * reference count of 0 means the slot is unused.  A reference count
         * of 1 means the slot is still in use but its segment is going
         * away; even if the handle matches, another slot may by coincidence
         * already be using the same handle value, so such a slot is skipped
         * and the search continues.
         */
        if (dsm_control->item[slot].refcnt > 1 &&
            dsm_control->item[slot].handle == seg->handle)
        {
            dsm_control->item[slot].refcnt++;
            seg->control_slot = slot;
            break;
        }
    }
    LWLockRelease(DynamicSharedMemoryControlLock);

    /*
     * If we didn't find the handle we're looking for in the control
     * segment, it probably means that everyone else who had it mapped,
     * including the original creator, died before we got to this point.
     * It's up to the caller to decide what to do about that.
     */
    if (seg->control_slot == INVALID_CONTROL_SLOT)
    {
        dsm_detach(seg);
        return NULL;
    }

    /* Here's where we actually try to map the segment. */
    dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
                &seg->mapped_address, &seg->mapped_size, ERROR);

    return seg;
}
dsm_detach函数用于删除dsm_segment结构体:先调用已注册的on_detach回调并解除映射,再递减其引用的共享内存的引用计数;如果递减后引用计数为1(表示已没有任何引用,0则表示槽未使用),则销毁该共享内存,并在销毁成功后将引用计数置为0。
/*
 * Detach from a segment and free its backend-local descriptor.
 *
 * In order: runs the registered on_detach callbacks, removes the mapping if
 * there is one, drops this backend's reference in the control segment
 * (destroying the segment if that leaves the count at 1), and finally
 * unlinks and frees the descriptor.  seg must not be used afterwards.
 */
void
dsm_detach(dsm_segment *seg)
{
    /*
     * Invoke registered callbacks.  Just in case one of those callbacks
     * throws a further error that brings us back here, pop the callback
     * before invoking it, to avoid infinite error recursion.
     */
    while (!slist_is_empty(&seg->on_detach))
    {
        slist_node *head = slist_pop_head_node(&seg->on_detach);
        dsm_segment_detach_callback *entry =
            slist_container(dsm_segment_detach_callback, node, head);
        on_dsm_detach_callback callback = entry->function;
        Datum       callback_arg = entry->arg;

        pfree(entry);
        callback(seg, callback_arg);
    }

    /*
     * Try to remove the mapping, if one exists.  Normally, there will be,
     * but maybe not, if we failed partway through a create or attach
     * operation.  We remove the mapping before decrementing the reference
     * count so that the process that sees a zero reference count can be
     * certain that no remaining mappings exist.  Even if this fails, we
     * pretend that it works, because retrying is likely to fail in the same
     * way.
     */
    if (seg->mapped_address != NULL)
    {
        dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
                    &seg->mapped_address, &seg->mapped_size, WARNING);
        seg->impl_private = NULL;
        seg->mapped_address = NULL;
        seg->mapped_size = 0;
    }

    /* Reduce reference count, if we previously increased it. */
    if (seg->control_slot != INVALID_CONTROL_SLOT)
    {
        uint32      slot = seg->control_slot;
        uint32      remaining;

        LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
        remaining = --dsm_control->item[slot].refcnt;
        seg->control_slot = INVALID_CONTROL_SLOT;
        LWLockRelease(DynamicSharedMemoryControlLock);

        /*
         * If the new reference count is 1, nobody references the segment
         * any more, so try to destroy it.
         *
         * If we fail to destroy the segment here, or are killed before we
         * finish doing so, the reference count will remain at 1, which will
         * mean that nobody else can attach to the segment.  At postmaster
         * shutdown time, or when a new postmaster is started after a hard
         * kill, another attempt will be made to remove the segment.
         *
         * The main case we're worried about here is being killed by a
         * signal before we can finish removing the segment.  In that case,
         * it's important to be sure that the segment still gets removed.
         * If we actually fail to remove the segment for some other reason,
         * the postmaster may not have any better luck than we did.  There's
         * not much we can do about that, though.
         */
        if (remaining == 1 &&
            dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
                        &seg->mapped_address, &seg->mapped_size, WARNING))
        {
            /* Destruction succeeded: mark the slot as unused. */
            LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
            dsm_control->item[slot].refcnt = 0;
            LWLockRelease(DynamicSharedMemoryControlLock);
        }
    }

    /* Clean up our remaining backend-private data structures. */
    if (seg->resowner != NULL)
        ResourceOwnerForgetDSM(seg->resowner, seg);
    dlist_delete(&seg->node);
    pfree(seg);
}
保留动态共享内存段 pin && unpin
dsm_pin_segment函数会保留动态共享内存段,直到postmaster关闭或调用dsm_unpin_segment。每个段不应多次调用此函数,除非在调用之间使用dsm_unpin_segment显式取消固定该段。请注意,此函数不会安排当前进程无限期地保持段映射;如果需要这种行为,则应在需要保留映射的每个进程中使用dsm_pin_mapping。Keep a dynamic shared memory segment until postmaster shutdown, or until dsm_unpin_segment is called. This function should not be called more than once per segment, unless the segment is explicitly unpinned with dsm_unpin_segment in between calls. Note that this function does not arrange for the current process to keep the segment mapped indefinitely; if that behavior is desired, dsm_pin_mapping() should be used from each process that needs to retain the mapping.
/*
 * Keep a dynamic shared memory segment until postmaster shutdown, or until
 * dsm_unpin_segment is called.
 *
 * seg - descriptor of the segment to pin; its control_slot identifies the
 *       entry in the control segment that is updated.
 *
 * This function should not be called more than once per segment, unless the
 * segment is explicitly unpinned with dsm_unpin_segment in between calls;
 * pinning an already-pinned segment is reported with elog(ERROR).
 *
 * Note that this function does not arrange for the current process to keep
 * the segment mapped indefinitely; if that behavior is desired,
 * dsm_pin_mapping() should be used from each process that needs to retain
 * the mapping.
 */
void
dsm_pin_segment(dsm_segment *seg)
{
    /*
     * Start from NULL: the value is stored into shared memory below no
     * matter what dsm_impl_pin_segment does with its out-parameter, so it
     * must never be indeterminate.  (dsm_impl_pin_segment is not visible
     * here; presumably not every implementation fills it in.)
     */
    void       *handle = NULL;

    /*
     * Bump reference count for this segment in shared memory.  This will
     * ensure that even if there is no session which is attached to this
     * segment, it will remain until postmaster shutdown or an explicit call
     * to unpin.
     */
    LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
    if (dsm_control->item[seg->control_slot].pinned)
        elog(ERROR, "cannot pin a segment that is already pinned");
    dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
    dsm_control->item[seg->control_slot].pinned = true;
    dsm_control->item[seg->control_slot].refcnt++;
    dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
    LWLockRelease(DynamicSharedMemoryControlLock);
}
dsm_unpin_segment函数取消固定以前用dsm_pin_segment固定的动态共享内存段。除非之前为此段调用了dsm_pin_segment,否则不应调用此函数。参数是dsm_handle而不是dsm_segment,这样就可以取消固定一个尚未attach的段。例如,如果对一个共享内存段的引用存储在另一个共享内存段中,这一点就很有用:您可能希望在销毁引用方的段之前,先取消固定被引用的段。Unpin a dynamic shared memory segment that was previously pinned with dsm_pin_segment. This function should not be called unless dsm_pin_segment was previously called for this segment. The argument is a dsm_handle rather than a dsm_segment in case you want to unpin a segment to which you haven’t attached. This turns out to be useful if, for example, a reference to one shared memory segment is stored within another shared memory segment. You might want to unpin the referenced segment before destroying the referencing segment.
/*
 * Unpin a dynamic shared memory segment that was previously pinned with
 * dsm_pin_segment.
 *
 * handle - handle of the segment to unpin.  A handle is taken rather than a
 *          dsm_segment so that a segment this backend has not attached can
 *          still be unpinned.
 *
 * Reports elog(ERROR) if no live slot carries the handle or if the slot is
 * not pinned.  If dropping the pin leaves the reference count at 1, the
 * segment is destroyed and, on success, its slot is marked unused.
 */
void
dsm_unpin_segment(dsm_handle handle)
{
    uint32      slot = INVALID_CONTROL_SLOT;
    uint32      idx;
    bool        last_reference;

    LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

    /*
     * Find the control slot for the given handle, ignoring unused slots and
     * segments that are concurrently going away (refcnt <= 1).
     */
    for (idx = 0; idx < dsm_control->nitems; ++idx)
    {
        if (dsm_control->item[idx].refcnt > 1 &&
            dsm_control->item[idx].handle == handle)
        {
            slot = idx;
            break;
        }
    }

    /*
     * We should definitely have found the slot, and it should not already
     * be in the process of going away, because this function should only be
     * called on a segment which is pinned.
     */
    if (slot == INVALID_CONTROL_SLOT)
        elog(ERROR, "cannot unpin unknown segment handle");
    if (!dsm_control->item[slot].pinned)
        elog(ERROR, "cannot unpin a segment that is not pinned");

    /*
     * Allow implementation-specific code to run.  We have to do this before
     * releasing the lock, because impl_private_pm_handle may get modified
     * by dsm_impl_unpin_segment.
     */
    dsm_impl_unpin_segment(handle,
                           &dsm_control->item[slot].impl_private_pm_handle);

    /* Note that 1 means no references (0 means unused slot). */
    last_reference = (--dsm_control->item[slot].refcnt == 1);
    dsm_control->item[slot].pinned = false;

    /* Now we can release the lock. */
    LWLockRelease(DynamicSharedMemoryControlLock);

    /* Clean up resources if that was the last reference. */
    if (last_reference)
    {
        /* Throwaway mapping state: this path has no descriptor to pass. */
        void       *junk_impl_private = NULL;
        void       *junk_mapped_address = NULL;
        Size        junk_mapped_size = 0;

        if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
                        &junk_mapped_address, &junk_mapped_size, WARNING))
        {
            LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
            dsm_control->item[slot].refcnt = 0;
            LWLockRelease(DynamicSharedMemoryControlLock);
        }
    }
}
src/backend/storage/ipc/dsm.c
src/backend/utils/mmgr/dsa.c