内核block层开发时遇到的各种变量同步问题总结

发布时间:2024年01月01日

本文是我在开发一个内核模块(统计进程级的IO派发延迟)时,遇到的一系列并发问题总结。这个内核模块的详细功能在《一次无语的内核调试经历(内核卡死、内核内存越界、spin lock锁异常)》开头第1节有详细介绍,希望读者先看下,本文不再介绍。

这个内核模块的基本功能是:在IO请求(简称为rq或者req)插入IO队列时执行的blk_mq_sched_request_inserted函数中,记录rq插入IO队列的时间点;在IO请求派发函数blk_mq_dispatch_rq_list中,记录rq真正派发给磁盘驱动的时间点;在IO请求传输完成时执行的blk_account_io_done函数中,计算IO请求在磁盘驱动层的传输耗时。源码如下:

  1. void blk_mq_sched_request_inserted(struct request *rq)
  2. {
  3. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable){
  4. ??????? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
  5. ??????? struct process_io_info *p_process_io_info_tmp = NULL;
  6. ??????? ………………….
  7. ? ? ? ?//为每个rq分配一个process_io_info
  8. ??????? p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
  9. ??????? memset(p_process_io_info_tmp,0,sizeof(struct process_io_info));
  10. ??????? //为每个派发IO的进程分配一个process_io_stat
  11. ??????? p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_ATOMIC);
  12. ??????? memset(p_process_rq_stat_tmp,0,sizeof(struct process_rq_stat))
  13. ??????? //记录rq所属进程pid及名字
  14. ?????? ?p_process_io_info_tmp->pid = current->pid;
  15. ??????? strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);
  16. ???????
  17. ??????? p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
  18. ??????? smp_mb();
  19. ??????? //记录rq插入IO队列的时间点
  20. ??????? p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
  21. ??????? p_process_rq_stat_tmp->rq = rq;
  22. ??????? rq->p_process_rq_stat = p_process_rq_stat_tmp;
  23. ??
  24. ???????? spin_lock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));??????? list_add(&rq->p_process_rq_stat->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head));
  25. ??????? spin_unlock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));
  26. ??????? return;
  27. ??? }
  28. }
  29. bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  30. ???????????????? bool got_budget)
  31. {
  32. ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
  33. ??? struct process_io_info *p_process_io_info_tmp = NULL;
  34. ??? ...............
  35. ??? ret = q->mq_ops->queue_rq(hctx, &bd);
  36. ??? ...............
  37. ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
  38. ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
  39. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
  40. ??????? //记录rq真正派发给磁盘驱动的时间点
  41. ??????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
  42. ??? }? ??
  43. }
  44. void blk_account_io_done(struct request *req, u64 now)
  45. {
  46. ??? .......................
  47. ??? if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
  48. ??????????? struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
  49. ??????????? struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
  50. ??????????? p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
  51. ??????????? p_process_rq_stat_tmp->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;
  52. ???????????
  53. ??????????? //计算IO请求在磁盘驱动层传输的真正耗时,并把最大的耗时保存到max_real_dc_time
  54. ?????????? ?p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time;
  55. ??????????? if( p_process_rq_stat_tmp->real_dc_time > p_process_io_info_tmp->max_real_dc_time){
  56. ??????????????? p_process_io_info_tmp->max_real_dc_time = p_process_rq_stat_tmp->real_dc_time;
  57. ??????????? }
  58. ??????????? req->p_process_rq_stat = NULL;
  59. ??? }
  60. ??? .......................
  61. }

在blk_mq_dispatch_rq_list函数执行q->mq_ops->queue_rq真正派发rq到磁盘驱动后,记录rq的真实派发时间:

  1. bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  2. ???????????????? bool got_budget)
  3. {
  4. ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
  5. ??? struct process_io_info *p_process_io_info_tmp = NULL;
  6. ??? ...............
  7. ??? ret = q->mq_ops->queue_rq(hctx, &bd);
  8. ??? ...............
  9. ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
  10. ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
  11. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
  12. ??????
  13. ????????? ??//p_process_rq_stat_tmp->rq_real_issue_time 0说明是无效的
  14. ??????????? if(p_process_rq_stat_tmp->rq_real_issue_time == 0){
  15. ??????????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
  16. ??????????????? //计算rq的真实派发时间
  17. ??????????????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
  18. ??????????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
  19. ??????????? }
  20. ??? }
  21. }

在print_process_io_info函数打印IO延迟参数,但遇到如下异常打印:

  • 1:打印的 max_real_dc_time 非常大
  • kworker/3:0 6202 rq_count:1 io_size:0M max_id_time:9us max_dc_time:693us max_idc_time:702us max_real_dc_time:1575904706us max_hctx_list_rq:0 rq_inflght_issue:0_1 rq_inflght_done:0_1 avg_id_time:9us avg_dc_time:693us avg_idc_time:702us
  • 2:打印的 max_real_dc_time 是负数
  • fio 6386 rq_count:1083 io_size:4M max_id_time:61199us max_dc_time:3106us max_idc_time:62114us max_real_dc_time:-2075967423us max_hctx_list_rq:0 rq_inflght_issue:219_28 rq_inflght_done:221_31 avg_id_time:9774us avg_dc_time:1391us avg_idc_time:11165us

为什么会出现以上问题呢?在把IO请求插入IO运行队列时会对struct process_rq_stat *p_process_rq_stat_tmp清0。执行到blk_mq_dispatch_rq_list()里的if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info)时,正常情况下rq这个IO请求还没传输完成,rq_real_issue_time会先被赋值,之后再执行blk_account_io_done()里的p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time,计算出的real_dc_time是准确的。

但是也可能rq已经派发完成并执行了blk_account_io_done()里计算real_dc_time的代码,此时p_process_rq_stat_tmp->rq_real_issue_time还没在blk_mq_dispatch_rq_list函数中赋值,rq_real_issue_time还是初值0。此时计算的real_dc_time等于当前时间减0,是有问题的,就会出现“max_real_dc_time:1575904706us”这么大的值。并且这个rq很快又被新的进程分配,并分配rq->p_process_rq_stat。接着才执行到blk_mq_dispatch_rq_list函数这里,错误执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())。而这个rq已经分配给了新的进程,只是插入IO队列,还没有派发!简单说rq->p_process_rq_stat已经是新分配的了。等这个rq接下来真的被派发,执行到blk_mq_dispatch_rq_list函数就会发现p_process_rq_stat_tmp->rq_real_issue_time不是0。这种情况就需要blk_mq_dispatch_rq_list函数中在ret = q->mq_ops->queue_rq(hctx, &bd)派发rq给磁盘驱动前后,判断rq->p_process_rq_stat所属的进程是不是变了。

那为什么max_real_dc_time会是负数呢?执行到blk_mq_dispatch_rq_list函数派发IO后返回但还没执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())时,rq正好派发完成执行blk_account_io_done()函数,rq->p_process_rq_stat还没清NULL,因为blk_mq_dispatch_rq_list函数里的if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat)成立,里边的p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())和 blk_account_io_done()里的p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get())-p_process_rq_stat_tmp->rq_real_issue_time就会同时执行,谁前谁后不一定。

这样就可能blk_account_io_done()里先ktime_to_us(ktime_get()),然后blk_mq_dispatch_rq_list函数执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())对p_process_rq_stat_tmp->rq_real_issue_time赋值,此时blk_account_io_done函数执行的p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time就可能是负数。这种情况需要加锁保护,绝对保证两个函数对rq_real_issue_time的使用或赋值,同时只有一个进程在进行。

根据以上两种情况,这样修改源码,并添加一些调试信息,下文用到:

  1. bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  2. ???????????????? bool got_budget)
  3. {
  4. ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
  5. ??? struct process_io_info *p_process_io_info_tmp = NULL;
  6. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
  7. ??????? //先保存rq所属进程PID
  8. ??????? rq_pid = rq->p_process_rq_stat->p_process_io_info->pid;
  9. ??????? printk("1:%s %s %d\n",__func__,current->comm,current->pid);
  10. ??? }
  11. ??? ...............
  12. ??? ret = q->mq_ops->queue_rq(hctx, &bd);
  13. ??? ...............
  14. ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
  15. ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
  16. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
  17. ??????? printk("2:%s %s %d\n",__func__,current->comm,current->pid);
  18. ??????? //派发rq前后rq所属进程必须是同一个
  19. ??????? if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid)){
  20. ??????????? //p_process_rq_stat_tmp->rq_real_issue_time 0说明是无效的,舍弃
  21. ??????????? if(p_process_rq_stat_tmp->rq_real_issue_time == 0){
  22. ??????????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
  23. ??????????????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
  24. ??????????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
  25. ??????????? }
  26. ???? ???????else
  27. ??????????? {
  28. ??????????????? printk(KERN_DEBUG"%s rq_real_issue_time:%llu rq_issue_time:%llu rq_inset_time:%llu p_process_io_info_tmp:%p\n",__func__,p_process_rq_stat_tmp->rq_real_issue_time,p_process_rq_stat
  29. _tmp->rq_issue_time,p_process_rq_stat_tmp->rq_inset_time,p_process_io_info_tmp);
  30. ??????????? }
  31. ??????? }
  32. ??? }
  33. }
  34. void blk_account_io_done(struct request *req, u64 now)
  35. {
  36. ??? .......................
  37. ??? if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
  38. ??????????? struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
  39. ??????????? struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
  40. ??????????? printk("%s rq:0x%llx process_rq_stat:0x%llx p_process_io_info_tmp:0x%llx pid:%d\n",__func__,(u64)req,(u64)(req->p_process_rq_stat),(u64)p_process_io_info_tmp,p_process_io_info_tmp->pid);
  41. ??????????? p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
  42. ??????????? p_process_rq_stat_tmp->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;
  43. ???????????
  44. ??????????? spin_lock(&(p_process_io_info_tmp->io_data_lock));
  45. ??????????? //计算IO请求在磁盘驱动层传输的真正耗时,并把最大的耗时保存到max_real_dc_time
  46. ??????????? p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time;
  47. ??????????? if( p_process_rq_stat_tmp->real_dc_time > p_process_io_info_tmp->max_real_dc_time){
  48. ??????????????? p_process_io_info_tmp->max_real_dc_time = p_process_rq_stat_tmp->real_dc_time;
  49. ??????????? }
  50. ??????????? rq->p_process_rq_stat = NULL;
  51. ??????????? spin_unlock(&(p_process_io_info_tmp->io_data_lock));
  52. ??? }
  53. ??? .......................
  54. }

blk_mq_sched_request_inserted函数源码再贴下:

  1. void blk_mq_sched_request_inserted(struct request *rq)
  2. {
  3. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable){
  4. ??????? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
  5. ??????? struct process_io_info *p_process_io_info_tmp = NULL;
  6. ???????
  7. ??????? p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
  8. ??????? memset(p_process_io_info_tmp,0,sizeof(struct process_io_info));
  9. ??????? p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_ATOMIC);
  10. ??????? memset(p_process_rq_stat_tmp,0,sizeof(struct process_rq_stat))
  11. ??????? p_process_io_info_tmp->pid = current->pid;
  12. ??????? strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);
  13. ???????
  14. ??????? p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
  15. ??????? smp_mb();
  16. ??????? p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
  17. ??????? p_process_rq_stat_tmp->rq = rq;
  18. ??????? rq->p_process_rq_stat = p_process_rq_stat_tmp;
  19. ??
  20. ??????? spin_lock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));
  21. ??????? /*为什么不直接把rq添加到process_io_insert_head链表,而是把rq->p_process_rq_stat添加到process_io_insert_head链表。这是因为可能在IO传输完成执行blk_account_io_done()后可能会释放掉rq。然后print_process_io_info()中从process_io_insert_head遍历到这个被释放的rq,使用rq->p_process_rq_stat->rq_inset_time就有问题了,因为rq已经失效了*/
  22. ??????? list_add(&rq->p_process_rq_stat->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head));
  23. ??????? spin_unlock_irq(&(rq->rq_disk->process_io.process_io_insert_lock));
  24. ???????
  25. ??????? printk("%s rq:0x%llx process_rq_stat:0x%llx rq_inset_time:%lld? p_process_io_info_tmp:0x%llx pid:%d rq_real_issue_time:%lld\n",__func__,(u64)rq,(u64)(rq->p_process_rq_stat),p_proce
  26. ss_rq_stat_tmp->rq_inset_time,(u64)p_process_io_info_tmp,p_process_io_info_tmp->pid,p_process_rq_stat_tmp->rq_real_issue_time);
  27. ??????? return;
  28. ??? }
  29. }

blk_mq_sched_request_inserted函数有两点需要注意:

  • 1:在 p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp 和 p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get())之间加了smp_mb()内存屏障,这是为了保证两个赋值的先后顺序。p_process_rq_stat_tmp->p_process_io_info表示该rq已经与进程绑定了,有了这个赋值才能使用p_process_rq_stat_tmp->rq_inset_time、p_process_rq_stat_tmp->rq等信息。
  • 2:用的list_add(&rq->p_process_rq_stat->process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head))而不是list_add(&rq-> process_io_insert,&(rq->rq_disk->process_io.process_io_insert_head)),原因是:IO传输完成执行blk_account_io_done()后可能会释放掉rq。然后print_process_io_info()中从process_io_insert_head遍历到这个被释放的rq,使用rq->p_process_rq_stat->rq_inset_time就有问题了,因为rq已经失效了。rq的分配和释放我不能控制,但是rq->p_process_rq_stat的分配和释放我可以控制!在blk_account_io_done函数中,先使用rq->p_process_rq_stat信息,再释放,防止丢失IO采集信息。

继续测试,又来了新的问题:压测时blk_mq_dispatch_rq_list函数会有打印如下:

[ 5071.063712] blk_mq_dispatch_rq_list rq:0xffff9e0ca34be4d0 process_rq_stat:0xffff9e0caba03b40 rq_real_issue_time:5070726879 p_process_io_info_tmp:0xffff9e0c77b086e8 pid:7129

就是说,blk_mq_dispatch_rq_list函数中if(p_process_rq_stat_tmp->rq_real_issue_time == 0)总是不成立,而走else分支。这是不合理的,我已经做了各种防护,为什么p_process_rq_stat_tmp->rq_real_issue_time不是0呢?不是0就说明p_process_rq_stat_tmp->rq_real_issue_time已经被其他进程赋值过了!这个很不合理,但是奇葩情况见多了,解决这种奇葩的问题,就要用最简单的办法。于是在IO请求插入IO算法队列的blk_mq_sched_request_inserted函数、IO派发给磁盘驱动执行的blk_mq_start_request和blk_mq_dispatch_rq_list函数、IO请求传输完成执行的blk_account_io_done函数,添加printk打印rq、p_process_rq_stat_tmp、p_process_rq_stat_tmp->rq_real_issue_time等信息。这些调试信息,前文列举源码时已经添加,这里只用再列下 blk_mq_start_request的:

  1. void blk_mq_start_request(struct request *rq)? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  2. {
  3. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
  4. ??????? struct process_rq_stat *p_process_rq_stat_tmp = rq->p_process_rq_stat;
  5. ??????? struct process_io_info *p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
  6. ???????
  7. ??????? p_process_rq_stat_tmp->rq_issue_time = ktime_to_us(ktime_get());? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ??
  8. ??????? p_process_rq_stat_tmp->id_time = p_process_rq_stat_tmp->rq_issue_time - p_process_rq_stat_tmp->rq_inset_time;
  9. ??????? printk("%s %s %d rq:0x%llx process_rq_stat:0x%llx rq_issue_time:%lld p_process_io_info_tmp:0x%llx pid:%d? rq_real_issue_time:%lld\n",__func__,current->comm,current->pid,(u64)rq,(u64)(rq->p_process_rq_stat),p_process_rq_stat_tmp->rq_issue_time,(u64)p_process_io_info_tmp,p_process_io_info_tmp->pid,p_process_rq_stat_tmp->rq_real_issue_time);
  10. ??? }
  11. }

实际调试下来,这个并发问题简直离谱到家了!问题的根源还是在blk_mq_dispatch_rq_list函数,再看下它的源码:

  1. bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  2. ???????????????? bool got_budget)
  3. {
  4. ??? struct process_rq_stat *p_process_rq_stat_tmp = NULL;
  5. ??? struct process_io_info *p_process_io_info_tmp = NULL;
  6. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
  7. ??????? //先保存rq所属进程PID
  8. ??????? rq_pid = rq->p_process_rq_stat->p_process_io_info->pid;
  9. ??????? printk("1:%s %s %d\n",__func__,current->comm,current->pid);
  10. ??? }
  11. ...............
  12. ? ?//派发rq到磁盘驱动
  13. ??? ret = q->mq_ops->queue_rq(hctx, &bd);
  14. ??? ...............
  15. ??? p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
  16. ??? p_process_rq_stat_tmp = rq->p_process_rq_stat;
  17. ??? if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat && rq->p_process_rq_stat->p_process_io_info){
  18. ??????? printk("2:%s %s %d\n",__func__,current->comm,current->pid);
  19. ??????? //派发rq前后rq所属进程必须是同一个
  20. ??????? if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid)){
  21. ??????????? //p_process_rq_stat_tmp->rq_real_issue_time 0说明是无效的,舍弃
  22. ??????????? if(p_process_rq_stat_tmp->rq_real_issue_time == 0){
  23. ??????????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
  24. ??????????????? p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get());
  25. ??????????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
  26. ??????????? }
  27. ??????????? else
  28. ??????????? {
  29. ??????????????? printk(KERN_DEBUG"%s rq_real_issue_time:%llu rq_issue_time:%llu rq_inset_time:%llu p_process_io_info_tmp:%p\n",__func__,p_process_rq_stat_tmp->rq_real_issue_time,p_process_rq_stat
  30. _tmp->rq_issue_time,p_process_rq_stat_tmp->rq_inset_time,p_process_io_info_tmp);
  31. ??????????? }
  32. ??????? }
  33. ??? }
  34. }

blk_mq_dispatch_rq_list函数中,首先rq_pid = rq->p_process_rq_stat->p_process_io_info->pid记录这个rq所属的进程pid,然后执行q->mq_ops->queue_rq(hctx, &bd)派发rq到磁盘驱动。接着执行if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid))判断rq所属进程是否变了,没有的话才会执行p_process_rq_stat_tmp->rq_real_issue_time = ktime_to_us(ktime_get())对 p_process_rq_stat_tmp->rq_real_issue_time赋值rq派发后的时间。

问题的关键点就是:在q->mq_ops->queue_rq(hctx, &bd) 和 if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid)) 之间极短的时间内,这个rq派发完成了,中断里执行blk_account_io_done释放掉了这个rq及它的process_io_info。然后又被同一个rq_pid进程立即分配了同一个rq,然后传输这个rq。在把rq插入到IO队列执行blk_mq_sched_request_inserted函数时,又为这个rq分配了同一个process_io_info!虽然难以置信,但这种情况是完全可能成立的!你要考虑到,软中断是可以打断当前进程的。并且这是虚拟机测试环境,更容易发生。

同一个进程,同一个rq,同一个process_io_info!因此还停留在blk_mq_dispatch_rq_list函数里的老进程,执行完q->mq_ops->queue_rq(hctx, &bd) 后,再执行到 if(rq->p_process_rq_stat &&(rq_pid == rq->p_process_rq_stat->p_process_io_info->pid))时,这个if就是成立的,但是这个rq及其process_io_info却被其他进程分配走了,对当前进程来说是无效的。其他进程完全可以也在执行blk_mq_dispatch_rq_list函数,并对p_process_rq_stat_tmp->rq_real_issue_time赋值。因此当前进程就会发现if(p_process_rq_stat_tmp->rq_real_issue_time == 0)竟然不成立。

遇到离谱的事,先不要动不动就怀疑内存有问题、内存越界!要先静下心来把可能的情况耐心复盘一下,往往此时就发现契机了!并且,有些问题可能在物理机上测试很难发生,但是在虚拟机环境却很容易发生,还是要在尽可能多的环境测试!

最后,再说下其他变量同步问题:

blk_mq_sched_request_inserted函数中为派发IO的进程分配process_rq_stat和process_io_info结构:每次传输IO请求分配一个process_rq_stat结构,每个派发IO的进程则分配一个process_io_info结构。process_io_info与进程绑定,process_rq_stat与IO请求rq绑定。在IO请求派发给磁盘驱动时执行的blk_mq_start_request函数、IO请求传输完成时执行的blk_account_io_done函数中,计算IO请求传输的id和dc耗时,并保存到process_rq_stat和process_io_info结构体中。

print_process_io_info函数每隔1s采集一次所有派发IO进程的process_io_info信息,获取每个进程传输IO的最大id、dc、传输数据量、iops、在磁盘驱动最大IO数、在IO算法队列最大延迟等IO数据,然后printk打印出来,最后对process_io_info清0。如果此时process_io_info绑定的进程也在对process_io_info结构体成员赋值,那二者就存在数据同步问题。这种情况只能使用spin lock解决,保证同时只有一个进程在使用或者修改process_io_info结构体信息。源码如下:

  1. void blk_account_io_done(struct request *req, u64 now)
  2. {
  3. ??? .......................
  4. ??? if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
  5. ??????????? struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
  6. ??????????? struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
  7. ??????????? p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
  8. ??????????? p_process_rq_stat_tmp->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;
  9. ??????????? ?//加锁防护
  10. ??????????? spin_lock(&(p_process_io_info_tmp->io_data_lock));
  11. ??????????? //计算IO请求在磁盘驱动层传输的真正耗时,并把最大的耗时保存到max_real_dc_time
  12. ??????????? p_process_rq_stat_tmp->real_dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_real_issue_time;
  13. ??????????? if( p_process_rq_stat_tmp->real_dc_time > p_process_io_info_tmp->max_real_dc_time){
  14. ??????????????? p_process_io_info_tmp->max_real_dc_time = p_process_rq_stat_tmp->real_dc_time;
  15. ??????????? }
  16. ??????????? rq->p_process_rq_stat = NULL;
  17. ??????????? spin_unlock(&(p_process_io_info_tmp->io_data_lock));
  18. ??? }
  19. ??? .......................
  20. }
  21. ……………….
  22. void print_process_io_info(struct process_io_control *p_process_io_tmp)
  23. {
  24. ??? struct process_io_info *p_process_io_info_tmp = NULL;
  25. ???
  26. ??? list_for_each_entry_rcu(p_process_io_info_tmp, &(p_process_io_tmp->process_io_control_head), process_io_info_list){
  27. ??????? if(p_process_io_info_tmp->complete_rq_count != 0){
  28. ??????????? //加锁防护
  29. ??????????? spin_lock_irq(&(p_process_io_info_tmp->io_data_lock));
  30. ??????????? .............
  31. ??????????? max_id_time = p_process_io_info_tmp->max_id_time;
  32. ??????????? max_dc_time = p_process_io_info_tmp->max_dc_time;
  33. ??????????? max_idc_time = p_process_io_info_tmp->max_idc_time;
  34. ???????????
  35. ??????????? p_process_io_info_tmp->max_id_time = 0;
  36. ??????????? p_process_io_info_tmp->max_dc_time = 0;
  37. ??????????? p_process_io_info_tmp->max_idc_time = 0;
  38. ??????????? spin_unlock_irq(&(p_process_io_info_tmp->io_data_lock));
  39. ??????????? printk打印 max_id_timemax_dc_timemax_idc_time 信息
  40. ??????? }else{
  41. ??????????? spin_lock_irq(&(p_process_io_tmp->process_lock_list));
  42. ??????????? list_del_rcu(&p_process_io_info_tmp->process_io_info_list);
  43. ?????????? ?spin_unlock_irq(&(p_process_io_tmp->process_lock_list));
  44. ??????? }
  45. ??? }
  46. }

明显,io_data_lock锁保证了blk_account_io_done和print_process_io_info函数对process_io_info结构体成员的读取、赋值、清0都是独占的,不存在数据同步问题。

还有一个问题是,如果多个地方都需要使用spin lock锁,最好定义多个spin lock锁,防止单个spin lock临界区代码过多。比如blk_mq_sched_request_inserted函数和print_process_io_info函数中,使用单独的process_io_insert_lock锁,保证把process_rq_stat添加到process_io_insert_head链表、以及对该链表的删除、遍历都是独占的,不存在数据同步问题。代码前文已经列出。

文章来源:https://blog.csdn.net/hu1610552336/article/details/135315764
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。