目录
pthread的内存结构
__thread
pthread specific API
__thread和pthread specific API对比
存储区域/寻址方式不同
性能/效率不同
能存储的数据不同
支持的数据个数不同
在C/C++程序中,全局变量默认是所有线程共享的,开发者需要处理多线程竞争问题。有些情况下我们需要保证一个线程独享一份数据,其它线程无法访问。典型的就是errno全局变量,它总是会保存当前线程最后一个调用的错误码,不会存在线程冲突。这个时候需要使用线程局部存储(TLS)来解决。
pthread的内存结构
在说明TLS之前,先了解下pthread的内存结构。glibc/nptl/descr.h中定义了线程重要的数据结构struct pthread
,它描述了用户态线程的完整信息,每创建一个pthread线程,都在内存中有一个对应的pthread结构体。pthread结构非常复杂,与TLS有关的是specific_1stblock数组和specific二级数组,后面会做说明。
/* Number of entries held by one second-level block of thread-specific
   data. */
#define PTHREAD_KEY_2NDLEVEL_SIZE 32
/* Number of second-level blocks needed to cover PTHREAD_KEYS_MAX keys.
   Adding (block size - 1) before dividing rounds the result up, so a
   partially filled last block still gets its own first-level slot. */
#define PTHREAD_KEY_1STLEVEL_SIZE \
((PTHREAD_KEYS_MAX + PTHREAD_KEY_2NDLEVEL_SIZE - 1) \
/ PTHREAD_KEY_2NDLEVEL_SIZE)
/* Thread descriptor (abridged excerpt: each `...` line stands for members
   that this listing leaves out). */
struct pthread
{
/* Start of the descriptor. The __padding member makes this union at
   least 24 pointers wide whichever header variant is compiled in. */
union
{
#if !TLS_DTV_AT_TP
/* This overlaps the TCB as used for TLS without threads (see tls.h). */
tcbhead_t header;
#else
struct
{
int multiple_threads;
int gscope_flag;
} header;
#endif
void *__padding[24];
};
list_t list;
pid_t tid;
...
/* One slot of thread-specific data: the value stored for a key plus the
   sequence number recorded together with it. */
struct pthread_key_data
{
/* Sequence number. We use uintptr_t to not require padding on
32- and 64-bit machines. On 64-bit machines it helps to avoid
wrapping, too. */
uintptr_t seq;
/* Data pointer. */
void *data;
/* First block of slots, embedded directly in the descriptor. */
} specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
/* Two-level array for the thread-specific data. */
struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];
/* Flag which is set when specific data is set. */
bool specific_used;
...
}
__thread
在GCC/Clang编译环境中,可以使用__thread
关键字来声明TLS变量,__thread
关键字不是C标准的一部分(C11标准中对应的关键字是_Thread_local),不同编译器提供的关键字也不相同(例如MSVC使用__declspec(thread))。
在Xcode 13.2上测试仅i386架构不支持__thread
:
/* File-local scratch pointer. It is thread-local everywhere except on
   i386 builds, which fall back to an ordinary static variable. */
#ifndef __i386__
static __thread char *g_thread_data = NULL;
#else
static char *g_thread_data = NULL;
#endif
使用__thread
关键字声明的变量,存储在pthread结构体与栈空间之间的内存区域。也就是说,从内存布局上看,高地址到低地址的内存分布是:pthread结构、__thread
变量区域、栈区(栈底和__thread
变量区顶相接)。
下面以Xcode 13.2/arm64运行的程序来说明这点。
/* Sample thread-local variables: every thread gets its own copy, each
   starting from the initializer shown here. */
__thread uint64_t g_tls_int = 6;
__thread char *g_tls_string = "kanchuan.com";

/* Reads both thread-local variables into locals and prints them to
   stdout with no separator or trailing newline. */
void tls_test(void)
{
    uint64_t value = g_tls_int;
    /* uint64_t is not `unsigned long long` on every ABI, so convert
       explicitly to the type that %llu expects. */
    printf("%llu", (unsigned long long)value);
    char *string = g_tls_string;
    printf("%s", string);
}
在tls_test入口处断点,查看对应的汇编程序,如下:
0x104235240 <+0>: sub sp, sp, #0x40 ; =0x40
0x104235244 <+4>: stp x29, x30, [sp, #0x30]
0x104235248 <+8>: add x29, sp, #0x30 ; =0x30
0x10423524c <+12>: adrp x0, 529
0x104235250 <+16>: add x0, x0, #0xd70 ; =0xd70
0x104235254 <+20>: ldr x8, [x0]
0x104235258 <+24>: blr x8
0x10423525c <+28>: str x0, [sp, #0x10]
0x104235260 <+32>: adrp x0, 529
0x104235264 <+36>: add x0, x0, #0xd88 ; =0xd88
0x104235268 <+40>: ldr x8, [x0]
0x10423526c <+44>: blr x8
0x104235270 <+48>: mov x8, x0
0x104235274 <+52>: ldr x0, [sp, #0x10]
0x104235278 <+56>: str x8, [sp, #0x18]
0x10423527c <+60>: ldr x8, [x0]
0x104235280 <+64>: stur x8, [x29, #-0x8]
0x104235284 <+68>: ldur x8, [x29, #-0x8]
0x104235288 <+72>: adrp x0, 471
0x10423528c <+76>: add x0, x0, #0x7fc ; =0x7fc
0x104235290 <+80>: mov x9, sp
0x104235294 <+84>: str x8, [x9]
0x104235298 <+88>: bl 0x104403be0 ; symbol stub for: printf
0x10423529c <+92>: ldr x0, [sp, #0x18]
0x1042352a0 <+96>: ldr x8, [x0]
0x1042352a4 <+100>: stur x8, [x29, #-0x10]
0x1042352a8 <+104>: ldur x8, [x29, #-0x10]
0x1042352ac <+108>: adrp x0, 471
0x1042352b0 <+112>: add x0, x0, #0x801 ; =0x801
0x1042352b4 <+116>: mov x9, sp
0x1042352b8 <+120>: str x8, [x9]
0x1042352bc <+124>: bl 0x104403be0 ; symbol stub for: printf
0x1042352c0 <+128>: ldp x29, x30, [sp, #0x30]
0x1042352c4 <+132>: add sp, sp, #0x40 ; =0x40
0x1042352c8 <+136>: ret
0x104235274处,sp寄存器偏移0x10字节读取到x0。在0x104235278处读取x0寄存器的值(g_tls_int):
(lldb) register read x0
x0 = 0x0000000281cf41a0
(lldb) memory read/1xg 0x0000000281cf41a0
0x281cf41a0: 0x0000000000000006
0x10423529c处,sp寄存器偏移0x18字节读取到x0。在0x1042352a0处读取x0寄存器的值(g_tls_string):
(lldb) register read x0
x0 = 0x0000000281cf41a8
(lldb) memory read/1xg 0x0000000281cf41a8
0x281cf41a8: 0x000000010440c7f0
(lldb) memory read 0x000000010440c7f0
0x10440c7f0: 65 61 73 65 61 70 69 2e 63 6f 6d 00 25 6c 6c 75 kanchuan.com.%llu
0x10440c800: 00 25 73 00 4d 79 41 70 70 6c 69 63 61 74 69 6f .%s.MyApplicatio
从上面的测试结果来看,读取__thread
变量都是通过fp指针偏移(向高地址偏移)来完成的。
__thread
修饰的变量必须是POD(Plain Old Data)类型,不支持class等高级语言特性。__thread
变量在线程生命周期一直存在,在线程销毁时释放。需要注意的是,由于__thread
并不能指定销毁方法,当我们定义一个__thread
修饰的指针变量,并在线程运行中malloc内存后,线程结束仅会将__thread
变量指针置NULL,需要开发者手动free内存。
/* Per-thread buffer pointer; NULL until the thread first calls tls_test(). */
__thread char *g_tls_string = NULL;

/* Gives the calling thread a zero-filled 1024-byte buffer on first use and
   leaves an existing buffer untouched on later calls. */
void tls_test(void)
{
    if (g_tls_string != NULL) {
        return;
    }
    g_tls_string = calloc(1024, 1);
    /* Nothing releases this buffer when the thread is destroyed; the
       allocated memory has to be freed by hand. */
}
如果想要在线程结束时,自动完成malloc内存的释放,需要使用pthread specific相关的API。
pthread specific API
pthread同时提供了以下API实现TLS的功能:
//nptl/bits/pthreadtypes.h
/* Keys for thread-specific data */
typedef unsigned int pthread_key_t;
/* Creates a key. The first argument receives the new key; the second is
   the destructor for the data, run when a thread is destroyed (may be
   null). */
int pthread_key_create(pthread_key_t *, void (* _Nullable)(void *));
/* Deletes a previously created key. */
int pthread_key_delete(pthread_key_t);
/* Stores a value under the key for the calling thread. */
int pthread_setspecific(pthread_key_t , const void * _Nullable);
/* Returns the value the calling thread stored under the key, or null. */
void* _Nullable pthread_getspecific(pthread_key_t);
pthread_key_create的第一个参数是pthread_key_t指针,用于接收创建成功返回的pthread_key_t,第二个参数是数据析构函数指针,会在线程销毁时执行。pthread_key_create成功后获得pthread_key_t,之后可通过pthread_key_t进行线程私有数据的读写。示例代码如下:
//create key
pthread_key_t key = 0;
pthread_key_create(&key, NULL);
//write
struct kanchuan_struct data;
pthread_setspecific(key, &data);
//read
struct kanchuan_struct* = (struct kanchuan_struct *)pthread_getspecific(key)
每一个进程都有一个全局数组__pthread_keys来管理pthread_key_t。
//nptl/internaltypes.h:
/* Thread-local data handling. */
/* Bookkeeping kept for one key slot. */
struct pthread_key_struct
{
    /* Sequence number of the slot. An even value (zero included) marks
       the slot as vacant. uintptr_t is used so that no padding is needed
       on 32- and 64-bit machines; on 64-bit machines the wider type also
       helps to avoid wrapping. */
    uintptr_t seq;

    /* Destructor to run on the data stored under this key. */
    void (*destr)(void *);
};
//sysdeps/unix/sysv/linux/bits/local_lim.h
/* This is the value this implementation supports. */
#define PTHREAD_KEYS_MAX 1024
//nptl/pthread_keys.c
/* Table of the key information. It has one entry per possible key, and
   a pthread_key_t value is used as the index into this array. */
struct pthread_key_struct __pthread_keys[PTHREAD_KEYS_MAX];
struct pthread_key_struct
结构中定义了seq和传入的析构函数的指针。一个程序同时最多可以创建PTHREAD_KEYS_MAX个pthread_key_t。pthread_key_t是全局的,但不同的线程通过pthread_key_t访问读写接口时,实际上操作的是不同的内存。
当执行pthread_key_create时,会从__pthread_keys数组中找到一个没有使用的pthread_key_struct结构,并对其seq加1。返回的pthread_key_t实际上就是这个pthread_key_struct在__pthread_keys数组中的序号。如下代码:
//nptl/pthread_key_create.c:
/* Claims a free slot in __pthread_keys and hands its index out as the key.

   KEY receives the index of the claimed slot; DESTR is stored in that slot
   as the destructor.

   Returns 0 on success, or EAGAIN when the scan of all PTHREAD_KEYS_MAX
   slots ends without claiming one. */
int
___pthread_key_create (pthread_key_t *key, void (*destr) (void *))
{
/* Find a slot in __pthread_keys which is unused. */
for (size_t cnt = 0; cnt < PTHREAD_KEYS_MAX; ++cnt)
{
uintptr_t seq = __pthread_keys[cnt].seq;
if (KEY_UNUSED (seq) && KEY_USABLE (seq)
/* We found an unused slot. Try to allocate it. */
/* The slot is claimed by moving its sequence number from seq to seq + 1.
   The branch is taken only when the compare-and-exchange yields zero,
   presumably meaning the stored value still equalled seq and was
   replaced -- confirm against the macro's definition. */
&& ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[cnt].seq,
seq + 1, seq))
{
/* Remember the destructor. */
__pthread_keys[cnt].destr = destr;
/* Return the key to the caller. */
*key = cnt;
/* The call succeeded. */
return 0;
}
}
/* No slot could be claimed. */
return EAGAIN;
}
当执行pthread_key_delete时,会根据pthread_key_t的序号,从__pthread_keys找到对应的pthread_key_struct,并对其seq加1。如下代码:
//nptl/pthread_key_delete.c
/* Releases KEY by advancing the sequence number of its slot in
   __pthread_keys by one.

   Returns 0 when the slot was in use and the update went through.
   Returns EINVAL when KEY is not below PTHREAD_KEYS_MAX, when the slot is
   already unused, or when the compare-and-exchange does not succeed. */
int
___pthread_key_delete (pthread_key_t key)
{
int result = EINVAL;
if (__glibc_likely (key < PTHREAD_KEYS_MAX))
{
unsigned int seq = __pthread_keys[key].seq;
/* Only a slot that is currently in use can be deleted; the update is
   attempted against the sequence number read just above. */
if (__builtin_expect (! KEY_UNUSED (seq), 1)
&& ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[key].seq,
seq + 1, seq))
/* We deleted a valid key. */
result = 0;
}
return result;
}
注意这里使用了
atomic_compare_and_exchange_bool_acq
来保证原子操作。
seq默认为0,无论是pthread_key_create还是pthread_key_delete都是对seq加1。当seq的值是偶数(包括0)时,表示当前pthread_key_struct未被使用,为奇数时表示在使用。
通过pthread_key_create分配pthread_key_t是全局的,但键值关联却是各线程独立的。在struct pthread
结构体中有下面的定义:
/* Thread-specific-data members of struct pthread (excerpt). Each slot
   pairs a stored value with the sequence number recorded alongside it. */
struct pthread_key_data
{
/* Sequence number. We use uintptr_t to not require padding on
32- and 64-bit machines. On 64-bit machines it helps to avoid
wrapping, too. */
uintptr_t seq;
/* Data pointer. */
void *data;
/* First block of slots, stored inline in the thread descriptor. */
} specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
/* Two-level array for the thread-specific data. */
struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];
struct pthread_key_data
结构定义了当前线程存储TLS数据的指针data;seq保存的是写入数据时对应struct pthread_key_struct
的seq值,读取时会将两者进行比较,如果不一致,说明该key在写入之后已被删除或重新创建,线程中保存的数据已经过期。
specific_1stblock并没有设置和PTHREAD_KEYS_MAX一样的大小,而是设置为PTHREAD_KEY_2NDLEVEL_SIZE(32)大小,这应该是从节省内存的角度设计的,大部分情况下我们并不会使用很多TLS变量。
执行pthread_setspecific时,当pthread_key_t的值小于PTHREAD_KEY_2NDLEVEL_SIZE时,直接使用specific_1stblock数组;当pthread_key_t的值大于或等于PTHREAD_KEY_2NDLEVEL_SIZE时,再按需申请内存空间使用specific二级数组,值存储在specific[idx1st][idx2nd].data。
//nptl/pthread_setspecific.c
/* Stores VALUE under KEY for the calling thread.

   Keys below PTHREAD_KEY_2NDLEVEL_SIZE use the specific_1stblock array of
   the thread descriptor. Larger keys go through the two-level `specific`
   array, whose second-level blocks are allocated on demand.

   Returns 0 on success, EINVAL when KEY is out of range or its slot in
   __pthread_keys is unused, and ENOMEM when a second-level block cannot
   be allocated. */
int
___pthread_setspecific (pthread_key_t key, const void *value)
{
struct pthread *self;
unsigned int idx1st;
unsigned int idx2nd;
struct pthread_key_data *level2;
unsigned int seq;
self = THREAD_SELF;
/* Special case access to the first 2nd-level block. This is the
usual case. */
if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
{
/* Verify the key is sane. */
if (KEY_UNUSED ((seq = __pthread_keys[key].seq)))
/* Not valid. */
return EINVAL;
level2 = &self->specific_1stblock[key];
/* Remember that we stored at least one set of data. */
if (value != NULL)
THREAD_SETMEM (self, specific_used, true);
}
else
{
/* The range check comes first, so __pthread_keys is never indexed with
   an out-of-range key. */
if (key >= PTHREAD_KEYS_MAX
|| KEY_UNUSED ((seq = __pthread_keys[key].seq)))
/* Not valid. */
return EINVAL;
idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;
/* This is the second level array. Allocate it if necessary. */
level2 = THREAD_GETMEM_NC (self, specific, idx1st);
if (level2 == NULL)
{
if (value == NULL)
/* We don't have to do anything. The value would in any case
be NULL. We can save the memory allocation. */
return 0;
/* calloc leaves every slot of the new block zero-filled. */
level2
= (struct pthread_key_data *) calloc (PTHREAD_KEY_2NDLEVEL_SIZE,
sizeof (*level2));
if (level2 == NULL)
return ENOMEM;
THREAD_SETMEM_NC (self, specific, idx1st, level2);
}
/* Pointer to the right array element. */
level2 = &level2[idx2nd];
/* Remember that we stored at least one set of data. */
THREAD_SETMEM (self, specific_used, true);
}
/* Store the data and the sequence number so that we can recognize
stale data. */
level2->seq = seq;
level2->data = (void *) value;
return 0;
}
有了上面的分析,执行pthread_getspecific的逻辑就比较清晰了。
//nptl/pthread_getspecific.c
/* Returns the value the calling thread has stored under KEY.

   NULL is returned when KEY is not below PTHREAD_KEYS_MAX, when the
   second-level block for KEY has not been allocated, when no value is
   stored, or when the stored value is stale, i.e. its recorded sequence
   number differs from the current one in __pthread_keys. A stale slot is
   also reset to NULL. */
void *
___pthread_getspecific (pthread_key_t key)
{
struct pthread_key_data *data;
/* Special case access to the first 2nd-level block. This is the
usual case. */
if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
data = &THREAD_SELF->specific_1stblock[key];
else
{
/* Verify the key is sane. */
if (key >= PTHREAD_KEYS_MAX)
/* Not valid. */
return NULL;
unsigned int idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
unsigned int idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;
/* If the sequence number doesn't match or the key cannot be defined
for this thread since the second level array is not allocated
return NULL, too. */
struct pthread_key_data *level2 = THREAD_GETMEM_NC (THREAD_SELF,
specific, idx1st);
if (level2 == NULL)
/* Not allocated, therefore no data. */
return NULL;
/* There is data. */
data = &level2[idx2nd];
}
void *result = data->data;
/* The sequence numbers are compared only when a value is present. */
if (result != NULL)
{
uintptr_t seq = data->seq;
/* A mismatch means the key's sequence number changed after the value
   was stored, so the value is dropped instead of being returned. */
if (__glibc_unlikely (seq != __pthread_keys[key].seq))
result = data->data = NULL;
}
return result;
}
按照glibc的实现,当执行pthread_key_create获取的pthread_key_t应该是比较小的值才能优先使用specific_1stblock数组。但笔者在macOS环境测试发现获取的pthread_key_t比较大,这里应该是macOS具体的实现有和glibc不一致的地方?
__thread
和pthread specific API对比
-
存储区域/寻址方式不同
pthread specific API定义的数据,是通过struct pthread
结构体的specific_1stblock数组和specific二级数组寻址,而__thread
变量则是通过fp寄存器偏移寻址。
-
性能/效率不同
由于__thread
是通过fp寄存器偏移寻址,性能比pthread specific API高。
-
能存储的数据不同
__thread
只能修饰POD类型变量,对于指针类型的数据,有申请内存时需要手动销毁;而pthread specific API支持传入销毁方法,支持所有数据类型。
-
支持的数据个数不同
理论上只要栈不被占满,__thread
可以无限定义(存疑?);而pthread specific API只能创建PTHREAD_KEYS_MAX个key,但可以通过结构体等的方式,使用一个key存储多个值。