开发了一个内核ko模块async_memory_reclaim_for_cold_file_area(内核版本5.14.0-284.11.1),使用kprobe技术捕捉内核copy_page_to_iter()函数,在里边执行自定义的hot_file_update_file_status()函数,统计文件页page的访问信息,源码简要如下:
- static void kprobe_handler_post(struct kprobe *p, struct pt_regs *regs,
- ??????? unsigned long flags)
- {
- ??? struct page *page = (struct page *)(regs->di);
- ??? if(page){
- ??????? hot_file_update_file_status(page);
- ??? }
- }
- static struct kprobe kp_read_cache_func = {
- ??? //buffer io read读取文件页page数据执行到 copy_folio_to_iter()
- ??? .symbol_name??? = "copy_page_to_iter",
- };
- kp_read_cache_func.post_handler = kprobe_handler_post;
- //注册kprobe函数
- ret = register_kprobe(&kp_read_cache_func);
但是偶发触发内核异常告警,如下:
- [ 2606.173350] WARNING: CPU: 3 PID: 3347 at arch/x86/kernel/fpu/core.c:60 irq_fpu_usable+0x39/0x50
- [ 2606.173420]? fuse [last unloaded: async_memory_reclaim_for_cold_file_area_debug]
- [ 2606.173423] CPU: 3 PID: 3347 Comm: bash Kdump: loaded Tainted: G?????????? OE??? -------- h---? 5.14.0-284.11.1.el9_2.x86_64 #1
- [ 2606.173425] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
- [ 2606.173426] RIP: 0010:irq_fpu_usable+0x39/0x50
- [ 2606.173432] RSP: 0018:ffffb6fe009076e8 EFLAGS: 00010006
- [ 2606.173434] RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
- [ 2606.173435] RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
- [ 2606.173435] RBP: ffffffffa30c6770 R08: 0000000000000004 R09: 0000000000000004
- [ 2606.173436] R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
- [ 2606.173437] R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
- [ 2606.173438] FS:? 00007f6c674d4740(0000) GS:ffff971e39ec0000(0000) knlGS:0000000000000000
- [ 2606.173461] CS:? 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
- [ 2606.173462] CR2: 0000559a22f5dad0 CR3: 000000000c066004 CR4: 00000000003706e0
- [ 2606.173475]? blake2s_compress+0x20/0xa0
- [ 2606.173480]? blake2s_final+0x41/0x80
- [ 2606.173483]? extract_entropy.constprop.0+0x94/0x250
- [ 2606.173493]? crng_make_state+0x129/0x180
- [ 2606.173495]? _get_random_bytes.part.0+0x4c/0x190
- [ 2606.173497]? ? __alloc_pages+0xe6/0x230
- [ 2606.173499]? get_random_u32+0x59/0x90
- [ 2606.173501]? allocate_slab+0x14a/0x460
- [ 2606.173505]? ___slab_alloc+0x44f/0x570
- [ 2606.173509]? ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
- [ 2606.173513]? ? find_busiest_group+0x65/0x190
- [ 2606.173516]? ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
- [ 2606.173519]? kmem_cache_alloc+0x294/0x300
- [ 2606.173522]? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
- [ 2606.173526]? ? 0xffffffffc0572011
- [ 2606.173528]? kprobe_post_process+0x25/0x70
- [ 2606.173531]? ? 0xffffffffc0572012
- [ 2606.173532]? kprobe_int3_handler+0x154/0x180
- [ 2606.173534]? do_int3+0x3b/0x80
- [ 2606.173536]? exc_int3+0x81/0xc0
- [ 2606.173546] RSP: 0018:ffffb6fe00907bf0 EFLAGS: 00000246
- [ 2606.173548] RBP: 0000000000000000 R08: 0000000000000402 R09: ffff971e10b97500
- [ 2606.173549] R10: ffffb6fe00907d98 R11: ffffffffffffffff R12: ffffb6fe00907d38
- [ 2606.173550] R13: ffffb6fe00907d60 R14: 0000000000000100 R15: 0000000000000000
- [ 2606.173553]? ? 0xffffffffc0572012
- [ 2606.173554]? filemap_read+0x18a/0x320
- ...........................
显然是irq_fpu_usable()函数里触发的问题。为了查清楚根因,/proc/sys/kernel/panic_on_warn置1。下次触发这个内核告警后,触发了内核crash,并保存了vmcoe。通过vmcore看到出问题时完整函数流程:
- crash> bt
- PID: 33???? TASK: ffff971d01358000? CPU: 3??? COMMAND: "bash"
- ?#0 [ffffb6fe00907400] machine_kexec at ffffffffa126c237
- ?#1 [ffffb6fe00907458] __crash_kexec at ffffffffa13c3c9a
- ?#2 [ffffb6fe00907518] panic at ffffffffa1cd7dc6
- ?#3 [ffffb6fe00907598] __warn.cold at ffffffffa1cd7feb
- ?#4 [ffffb6fe009075d0] report_bug at ffffffffa17ac20e
- ?#5 [ffffb6fe00907608] handle_bug at ffffffffa1d254bc
- ?#6 [ffffb6fe00907618] exc_invalid_op at ffffffffa1d25684
- ?#7 [ffffb6fe00907630] asm_exc_invalid_op at ffffffffa1e00af6
- ??? [exception RIP: irq_fpu_usable+57]
- ??? RIP: ffffffffa1232d29? RSP: ffffb6fe009076e8? RFLAGS: 00010006
- ??? RAX: ffffffffa30c6794? RBX: ffffffffa30c6740? RCX: 0000000000000024
- ??? RDX: 0000000080110002? RSI: ffffffffa30c6770? RDI: ffffffffa30c6740
- ??? RBP: ffffffffa30c6770?? R8: 0000000000000004?? R9: 0000000000000004
- ??? R10: ffffffffa30c6794? R11: 00000000000319c0? R12: 0000000000000001
- ??? R13: ffffffffa30c6740? R14: 0000000000000024? R15: fffffc6140046780
- ??? ORIG_RAX: ffffffffffffffff? CS: 0010? SS: 0018
- ?#8 [ffffb6fe009076e8] blake2s_compress at ffffffffa12e5ae0
- ?#9 [ffffb6fe00907718] blake2s_final at ffffffffa176e391
- #10 [ffffb6fe00907738] extract_entropy.constprop.0 at ffffffffa18e2904
- #11 [ffffb6fe009077d8] crng_reseed.constprop.0 at ffffffffa18e2b25
- #12 [ffffb6fe00907818] crng_make_state at ffffffffa18e2dd9
- #13 [ffffb6fe00907848] _get_random_bytes at ffffffffa18e2e7c
- #14 [ffffb6fe009078f8] get_random_u32 at ffffffffa18e3129
- #15 [ffffb6fe00907918] allocate_slab at ffffffffa1577bea
- #16 [ffffb6fe00907960] ___slab_alloc at ffffffffa157b17f
- #17 [ffffb6fe00907a00] kmem_cache_alloc at ffffffffa157d194
- #18 [ffffb6fe00907a48] hot_file_update_file_status at ffffffffc0dd9d24 [async_memory_reclaim_for_cold_file_area]
- #19 [ffffb6fe00907aa8] kprobe_post_process at ffffffffa126ec65
- #20 [ffffb6fe00907ab8] kprobe_int3_handler at ffffffffa126ef44
- #21 [ffffb6fe00907ae8] do_int3 at ffffffffa1225b2b
- #22 [ffffb6fe00907af8] exc_int3 at ffffffffa1d25f51
- #23 [ffffb6fe00907b10] asm_exc_int3 at ffffffffa1e00b35
- #24 [ffffb6fe00907b98] copy_page_to_iter at ffffffffa1764d32
- #25 [ffffb6fe00907bf8] filemap_read at ffffffffa14d7b9a
- ..................
触发问题的函数流程是,kprobe捕捉内核函数copy_page_to_iter。在读文件时执行到该函数时,产生int3中断,在中断服务函数里最后执行自定义的hot_file_update_file_status()函数,在该函数里执行kmem_cache_alloc()分配一个自定义slab数据结构。而在分配slab时,执行allocate_slab->shuffle_freelist->get_random_int分配一个随机数,源码如下:
- static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
- {
- ??? void *start;
- ??? void *cur;
- ??? void *next;
- ??? unsigned long idx, pos, page_limit, freelist_count;
- ??? if (page->objects < 2 || !s->random_seq)
- ??????? return false;
- ??? freelist_count = oo_objects(s->oo);
- ??? //分配随机数,这里触发内核告警
- ??? pos = get_random_int() % freelist_count;
- ??? ............
- ??? next = next_freelist_entry(s, page, &pos, start, page_limit,freelist_count);
- ??? set_freepointer(s, cur, next);
- ??? ............
- }
为什么分配slab时要分配随机数呢?查资料说是为了使分配的slab obj地址随机,不容易被攻击,安全考虑。
把这个过程的函数流程再整理一下hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable。再看下最后的几个关键blake2s_compress函数调用:
- void blake2s_compress(struct blake2s_state *state, const u8 *block,
- ?????????? ???size_t nblocks, const u32 inc)
- {
- ??? kernel_fpu_begin();
- ..................
- ??? kernel_fpu_end();
- }
- static inline void kernel_fpu_begin(void)
- {
- ??? kernel_fpu_begin_mask(KFPU_MXCSR);
- }
- void kernel_fpu_begin_mask(unsigned int kfpu_mask)
- {
- ??? preempt_disable();
- ??? //这里触发内核告警
- ??? WARN_ON_FPU(!irq_fpu_usable());
- ??? WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- ??? this_cpu_write(in_kernel_fpu, true);
- ??? ..................
- }
是irq_fpu_usable()函数返回false触发的crash,看下这个函数的源码
- bool irq_fpu_usable(void)
- {
- ??? if (WARN_ON_ONCE(in_nmi()))
- ??????? return false;
- ??? if (this_cpu_read(in_kernel_fpu))
- ??????? return false;
- ??? if (!in_hardirq())
- ??????? return true;
- ??? return !softirq_count();
- }
显然,in_nmi()、this_cpu_read(in_kernel_fpu)、softirq_count()大于0都可能会导致该函数返回false,得一一排查。
crash时的cpu是cpu3,很快查找cpu3的in_kernel_fpu这个per cpu 变量是0,
- crash> in_kernel_fpu
- PER-CPU DATA TYPE:
- ? bool in_kernel_fpu;
- PER-CPU ADDRESSES:
- ? [0]: ffff971e39e18ed8
- ? [1]: ffff971e39e58ed8
- ? [2]: ffff971e39e98ed8
- ? //cpu3的in_kernel_fpu 变量地址
- ? [3]: ffff971e39ed8ed8
- crash> rd? 0xffff971e39ed8ed8
- ffff971e39ed8ed8:? 0000000000000000??????????????????? ........? ??
in_nmi() 和 softirq_count() 得查看cpu3的preempt_count变量,是0x 80110003,如下:
- crash> __preempt_count
- PER-CPU DATA TYPE:
- ? int __preempt_count;
- PER-CPU ADDRESSES:
- ? [0]: ffff971e39e18f00
- ? [1]: ffff971e39e58f00
- ? [2]: ffff971e39e98f00
- ? //cpu3的preempt_count变量地址
- ? [3]: ffff971e39ed8f00
- crash> rd -x? ffff971e39ed8f00
- ffff971e39ed8f00:? 0000000080110003?
然后查看in_nmi()和softirq_count(),全都来自preempt_count()函数。
- #define PREEMPT_NEED_RESCHED??? 0x80000000
- static __always_inline int preempt_count(void)
- {
- ??? return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
- }
- #define PREEMPT_BITS??? 8
- #define PREEMPT_SHIFT?? 0
- #define SOFTIRQ_BITS? ??8
- #define SOFTIRQ_SHIFT?? (PREEMPT_SHIFT + PREEMPT_BITS)//8
- #define __IRQ_MASK(x)?? ((1UL << (x))-1)
- #define HARDIRQ_SHIFT?? (SOFTIRQ_SHIFT + SOFTIRQ_BITS)//16
- #define HARDIRQ_BITS??? 4
- ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?// ((1 << 4) - 1) << 16? 0xF << 16
- #define HARDIRQ_MASK??? (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
- ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//preempt_count 的 bit16~bit19
- #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
- ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//((1 << 8) - 1) << 8??? 0x7F << 8
- #define SOFTIRQ_MASK??? (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
- ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//preempt_count 的 bit8~bit15
- # define softirq_count()??? (preempt_count() & SOFTIRQ_MASK)
- #define NMI_SHIFT?? (HARDIRQ_SHIFT + HARDIRQ_BITS)//20
- #define NMI_BITS??? 4
- ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? //((1 << 4) - 1) << 20?? 0xF << 20
- #define NMI_MASK??? (__IRQ_MASK(NMI_BITS)???? << NMI_SHIFT)
- ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??//preempt_count 的 bit20~bit23
- #define nmi_count() (preempt_count() & NMI_MASK)
- #define in_nmi()??????? (nmi_count())
cpu3的preempt_count()值是 0x80110003 , bit8~bit15 是0,bit16~bit19是1,bit20~bit23是1。即 in_nmi() 是1,in_hardirq()是1,softirq_count()是0。显然,问题根源竟然是 in_nmi() 是非0导致的。这个就是NMI计数,非0说明当前处于NMI中断。这怎么可能呢?怎么会处于NMI中断呢?什么是NMI中断,这个一般是当发生hard lock、cpu硬件级别错误时才会发生的不可屏蔽中断。可是这个问题场景只是我利用kprobe捕捉内核函数,然后产生int3中断,在中断服务函数里最后执行自定义的hot_file_update_file_status()函数,在该函数里执行kmem_cache_alloc()分配一个slab数据结构而已,这个普通场景怎么会触发NMI中断呢?
这个虚拟机是rocky 9.2 ,之前就出现过莫名奇妙的问题,怀疑内存数据有问题,难道这次又是内存数据有问题?当遇到莫名其妙的问题,就容易怀疑到内存有问题导致数据跳变、内存越界等导致?哪里会有这么多神奇的行为,还是先耐着性子看下源码吧。于是发现in_nmi()大于0是执行了 __nmi_enter()导致的,如下:
- #define __nmi_enter()?????????????????????? \
- ??? do {??????????????????????????? \
- ??????? lockdep_off();????????????????? \
- ??????? arch_nmi_enter();?????????????? \
- ??????? BUG_ON(in_nmi() == NMI_MASK);?????????? \
- ??????? __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);?? \
- ??? } while (0)
再进一步,发现int3中断里竟然在irqentry_nmi_enter()函数里执行了__nmi_enter,细节如下:
- irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
- {
- ??? irqentry_state_t irq_state;
- ??? irq_state.lockdep = lockdep_hardirqs_enabled();
- ??? __nmi_enter();//令 in_nmi 加1
- ??? lockdep_hardirqs_off(CALLER_ADDR0);
- ??? lockdep_hardirq_enter();
- ??? rcu_nmi_enter();
- ??? instrumentation_begin();
- ??? trace_hardirqs_off_finish();
- ??? ftrace_nmi_enter();
- ??? instrumentation_end();
- ??? return irq_state;
- }
- DEFINE_IDTENTRY_RAW(exc_int3)
- {
- ??? if (poke_int3_handler(regs))
- ??????? return;
- ? ?if (user_mode(regs)) {
- ? ? ? ?……………
- ??? } else {
- ??????? //令 in_nmi 加1
- ??????? irqentry_state_t irq_state = irqentry_nmi_enter(regs);
- ??????? instrumentation_begin();
- ??????? //执行kprobe 的函数
- ??????? if (!do_int3(regs))
- ??????????? die("int3", regs, 0);
- ??????? instrumentation_end();
- ??????? //令 in_nmi 减1
- ??????? irqentry_nmi_exit(regs, irq_state);
- ??? }
- }
- static bool do_int3(struct pt_regs *regs)
- {
- ??? int res;
- #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- ??? if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
- ???????????? SIGTRAP) == NOTIFY_STOP)
- ??????? return true;
- #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
- #ifdef CONFIG_KPROBES
- ??? //执行kprobe 的函数
- ??? if (kprobe_int3_handler(regs))
- ??????? return true;
- #endif
- ??? res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
- ??? return res == NOTIFY_STOP;
- }
简单说:从源码角度来说,每次kprobe触发的int3中断,在exc_int3()中执行kprobe hot_file_update_file_status函数前,都会先执行 irqentry_nmi_enter->__nmi_enter 令NMI中断加1。这样看到的,每次执行 hot_file_update_file_status->kmem_cache_alloc 函数分配slab 前,都会出发内核告警irq_fpu_usable。可实际测试是随机的?这是怎么回事?
我认为 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable,这个流程并不是每次都执行到最后的 irq_fpu_usable ,中间有很多条件判断,但是一旦执行到irq_fpu_usable ,必然会触发内核告警。
简单说,kprobe的函数里,不能执行 kmem_cache_alloc()分配slab 内存。因为kprobe的init3中断会令NMI计数加1,然后执行kmem_cache_alloc()分配slab有概率执行到irq_fpu_usable,因为NMI计数大于0而触发内核告警。
怎么解决呢?
- 1:在执行 hot_file_update_file_status->kmem_cache_alloc 前,自己的源码里强制令nmi计数减1,但怕对系统会有不良影响。
- 2:自己实现一套简易的slab源码,不执行 get_random_int() 函数就行。
但是这个两个方案改动太大,研究源码后,其实有个很简单的方法,不用动内核。
禁止掉 allocate_slab random 功能,就不会执行 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int 函数,具体改动如下:
- struct hot_cold_file_global
- {
- ??? struct kmem_cache *file_stat_cachep;
- }
- struct hot_cold_file_global hot_cold_file_global_info;
- static void (*cache_random_seq_destroy)(struct kmem_cache *cachep);
- //获取内核cache_random_seq_destroy函数指针
- cache_random_seq_destroy = (void *)kallsyms_lookup_name_async("cache_random_seq_destroy");
- cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
- //禁止分配 file_stat_cachep? 这个slab时使用随机数功能,分配slab时就不会执行到shuffle_freelist->get_random_int函数
- cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
当执行cache_random_seq_destroy()函数后,就令slab的random_seq为NULL
- void cache_random_seq_destroy(struct kmem_cache *cachep)
- {
- ??? kfree(cachep->random_seq);
- ??? cachep->random_seq = NULL;
- }
然后将来分配slab,执行到allocate_slab->shuffle_freelist时,直接返回false,不会再执行到get_random_int()函数。
- static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
- {
- ??? void *start;
- ??? void *cur;
- ??? void *next;
- ??? unsigned long idx, pos, page_limit, freelist_count;
- ? ?? //直接return false
- ??? if (page->objects < 2 || !s->random_seq)
- ??????? return false;
- ??? freelist_count = oo_objects(s->oo);
- ??? //分配随机数,这里触发crash
- ??? pos = get_random_int() % freelist_count;
- ??? ............
- }