源码基于:Linux 5.4
0. 前言
内核异常的级别大致分为三个:BUG、oops、panic。
BUG 是指那些不符合内核的正常设计,但内核能够检测出来并且对系统运行不会产生影响的问题,比如在原子上下文中休眠,在内核中用 BUG 标识。
Oops 就意外着内核出了异常,此时会将产生异常时出错原因,CPU的状态,出错的指令地址、数据地址及其他寄存器,函数调用的顺序甚至是栈里面的内容都打印出来,然后根据异常的严重程度来决定下一步的操作:杀死导致异常的进程或者挂起系统。
panic 本意是“恐慌”的意思,这里意旨 kernel 发生了致命错误导致无法继续运行下去的情况。根据实际情况 Oops最终也可能会导致panic 的发生。
本文将简单分析下这三种异常的流程。
1. BUG()
有过驱动调试经验的人肯定都知道这个东西,这里的 BUG 跟我们一般认为的 “软件缺陷” 可不是一回事,这里说的 BUG() 其实是linux kernel中用于拦截内核程序超出预期的行为,属于软件主动汇报异常的一种机制。这里有个疑问,就是什么时候会用到呢?一般来说有两种用到的情况:
- 一是软件开发过程中,若发现代码逻辑出现致命 fault 后就可以调用BUG()让kernel死掉(类似于assert),这样方便于定位问题,从而修正代码执行逻辑;
- 另外一种情况就是,由于某种特殊原因(通常是为了debug而需抓ramdump),我们需要系统进入kernel panic的情况下使用;
对于 arm64 来说 BUG() 定义如下:
arch/arm64/include/asm/bug.h
#ifndef _ARCH_ARM64_ASM_BUG_H
#define _ARCH_ARM64_ASM_BUG_H
#include <linux/stringify.h>
#include <asm/asm-bug.h>
#define __BUG_FLAGS(flags) \
asm volatile (__stringify(ASM_BUG_FLAGS(flags)));
#define BUG() do { \
__BUG_FLAGS(0); \
unreachable(); \
} while (0)
#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
#define HAVE_ARCH_BUG
#include <asm-generic/bug.h>
#endif /* ! _ARCH_ARM64_ASM_BUG_H */
注意最后的 define HAVE_ARCH_BUG ,会 include asm-generict/bug.h 对 BUG() 进行重定义。
include/asm-generic/bug.h
#ifndef HAVE_ARCH_BUG
#define BUG() do { \
printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
barrier_before_unreachable(); \
panic("BUG!"); \
} while (0)
#endif
#ifndef HAVE_ARCH_BUG_ON
#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
#endif
也就是在Linux 5.4版本 BUG() 和 BUG_ON() 都是执行的 panic()。
而若没有定义 HAVE_ARCH_BUG 这个宏,BUG() 会向CPU 下发一条未定义指令而触发ARM 发起未定义指令异常,随后进入 kernel 异常处理流程,通过调用die() 经历Oops 和 panic,下面会单独分析 die() 函数,详细看第 2 节。
2. die()
arch/arm64/kernel/traps.c
static DEFINE_RAW_SPINLOCK(die_lock);
/*
* This function is protected against re-entrancy.
*/
void die(const char *str, struct pt_regs *regs, int err)
{
int ret;
unsigned long flags;
raw_spin_lock_irqsave(&die_lock, flags);
oops_enter();
console_verbose();
bust_spinlocks(1);
ret = __die(str, err, regs);
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
oops_exit();
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
raw_spin_unlock_irqrestore(&die_lock, flags);
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
oops_enter() ---> oops_exit() 为Oops 的处理流程,获取console 的log 级别,并通过 __die() 通过对Oops 感兴趣的模块进行callback,打印模块状态不为 MODULE_STATE_UNFORMED 的模块信息,打印PC、LR、SP、x0 等寄存器信息,打印调用栈信息,等等。
2.1 __die()
arch/arm64/kernel/traps.c
static int __die(const char *str, int err, struct pt_regs *regs)
{
static int die_counter;
int ret;
pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n",
str, err, ++die_counter);
/* trap and error numbers are mostly meaningless on ARM */
ret = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV);
if (ret == NOTIFY_STOP)
return ret;
print_modules();
show_regs(regs);
dump_kernel_instr(KERN_EMERG, regs);
return ret;
}
- 打印 EMERG 的log,Internal error: oops.....;
- notify_die() 会通知所有对 Oops 感兴趣的模块并进行callback;
- print_modules() 打印模块状态不为 MODULE_STATE_UNFORMED 的模块信息;
- show_regs() 打印PC、LR、SP 等寄存器的信息,同时打印调用堆栈信息;
- dump_kernel_instr() 打印 pc指针和前4条指令;
这里不过多的剖析,感兴趣的可以查看下源码。
这里需要注意的是 notify_die() 会通知所有的Oops 感兴趣的模块,模块会通过函数 register_die_notifier() 将callback 注册到全局结构体变量 die_chain 中(多个模块注册进来形成一个链表),然后在通过 notify_die() 函数去解析这个 die_chain,并分别调用callback:
kernel/notifier.c
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
.regs = regs,
.str = str,
.err = err,
.trapnr = trap,
.signr = sig,
};
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
2.2 oops同时有可能panic
从上面 die() 函数最后看到,oops_exit() 之后也有可能进入panic():
arch/arm64/kernel/traps.c
void die(const char *str, struct pt_regs *regs, int err)
{
...
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
...
}
处于中断 或 panic_on_oops 打开时进入 panic。
中断的可能性:
- 硬件 IRQ;
- 软件 IRQ;
- NMI;
panic_on_oops 的值受 CONFIG_PANIC_ON_OOPS_VALUE 影响。当然该值也可以通过节点
/proc/sys/kernel/panic_on_oops 进行动态修改。
3. panic()
panic 本意是“恐慌”的意思,这里意旨kernel发生了致命错误导致无法继续运行下去的情况。
kernel/panic.c
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
void panic(const char *fmt, ...)
{
static char buf[1024];
va_list args;
long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
//禁止本地中断,避免出现死锁,因为无法防止中断处理程序(在获得panic锁后运行)再次被调用panic
local_irq_disable();
//禁止任务抢占
preempt_disable_notrace();
//只允许一个CPU执行该代码,通过 panic_smp_self_stop() 保证当一个CPU执行panic时,
//其他CPU处于停止或等待状态;
this_cpu = raw_smp_processor_id();
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
//把console的打印级别放开
console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
//解析panic所携带的message,前缀为Kernel panic - not syncing
pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
dump_stack();
#endif
//如果kgdb使能,即CONFIG_KGDB为y,在停掉所有其他CPU之前,跳转kgdb断点运行
kgdb_panic(buf);
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
* the "crash_kexec_post_notifiers" option to the kernel.
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
printk_safe_flush_on_panic();
__crash_kexec(NULL);
//停掉其他CPU,只留下当前CPU干活
smp_send_stop();
} else {
/*
* If we want to do crash dump after notifier calls and
* kmsg_dump, we will need architecture dependent extra
* works in addition to stopping other CPUs.
*/
crash_smp_send_stop();
}
//通知所有对panic感兴趣的模块进行回调,添加一些kmsg信息到输出
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
/* Call flush even twice. It tries harder with a single online CPU */
printk_safe_flush_on_panic();
//输出panic的kernel log信息
kmsg_dump(KMSG_DUMP_PANIC);
/*
* If you doubt kdump always works fine in any situation,
* "crash_kexec_post_notifiers" offers you a chance to run
* panic_notifiers and dumping kmsg before kdump.
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
#ifdef CONFIG_VT
unblank_screen();
#endif
console_unblank();
/*
* We may have ended up stopping the CPU holding the lock (in
* smp_send_stop()) while still having some valuable data in the console
* buffer. Try to acquire the lock then release it regardless of the
* result. The release will also print the buffers out. Locks debug
* should be disabled to avoid reporting bad unlock balance when
* panic() is not being callled from OOPS.
*/
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
panic_print_sys_info();
if (!panic_blink)
panic_blink = no_blink;
//如果sysctl配置了panic_timeout > 0则在panic_timeout后重启系统
if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
if (panic_timeout != 0) {
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
if (panic_reboot_mode != REBOOT_UNDEFINED)
reboot_mode = panic_reboot_mode;
emergency_restart();
}
#ifdef __sparc__
{
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n"
"twice on console to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
disabled_wait();
#endif
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
/* Do not scroll important messages printed above */
suppress_printk = 1;
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
EXPORT_SYMBOL(panic);