Protoss

千金散尽还复来,莫使金樽空对月


« Solaris thread limit... | Main | NPort ID Virtualizat... »
星期六 三月 22, 2008

用kmdb调试系统panic一例

今天在和另一个驱动模块Q做联调的时候,系统莫名其妙地一通狂panic。

以下是引起异常的thread的堆栈

panic[cpu0]/thread=2a102f91ca0: BAD TRAP: type=34 rp=2a102f910b0 addr=ffffcafebaddcbe6 mmu_fsr=0

sched: alignment error:
addr=0xffffcafebaddcbe6
pid=0, pc=0x7bea2f50, sp=0x2a102f90951, tstate=0xe2001600, context=0x0
g1-g7: 116dd9c, 189d800, 2043, 2000, 2000, 1c, 2a102f91ca0

000002a102f90dd0 unix:die+98 (34, 2a102f910b0, ffffcafebaddcbe6, 0, 10a0800, 2a102f90e90)
  %l0-3: 000000000180c000 0000000000000000 0000000000001a10 0000000001857cc0
  %l4-7: 0000000000000000 0000000000000000 0000000000000000 0000000000010009
000002a102f90eb0 unix:trap+688 (2a102f910b0, 180c000, e00000034, e, 10000, 0)
  %l0-3: 000000000180c000 0000000000000000 0000000000001a10 0000000001857cc0
  %l4-7: 0000000000000000 0000000000000000 0000000000000000 0000000000010009
000002a102f91000 unix:ktl0+64 (6002038b000, 7bee0d9e, 0, 0, 70072000, 70134400)
  %l0-3: 000000000180c000 0000000000000000 00000000e2001600 000000000101ec98
  %l4-7: 0000000000000000 0000000000000000 0000000000000000 000002a102f910b0
000002a102f91150 qlc:ql_init_pkt+24 (0, 60028109c00, baddcafebaddcafe, fca2200, fca2000, 70133000)
  %l0-3: 0000060028109c00 0000000000000001 0000000000000100 0000000000000100
  %l4-7: 0000000000000000 0000000000000000 0000000000000800 000000007bee0c00
000002a102f91200 fctl:fc_ulp_init_packet+18 (6002465c000, 60028109c00, 0, 0, 7bea2ed4, 6002026cd80)
  %l0-3: 0000060028109c00 0000000000000001 0000000000000100 0000000000000100
  %l4-7: 0000000000000000 000006002026cd80 0000000000000800 0000000000000000
000002a102f912b0 fp:fp_alloc_pkt+60 (6002465c000, 74, 74, 0, 0, 60028109bd8)
  %l0-3: 0000060028109c00 0000000000000001 0000000000000100 0000000000000100
  %l4-7: 0000000000000000 000006002026cd80 0000000000000800 0000000000000000
000002a102f913a0 fp:fp_attach_handler+4a8 (60020032d60, 1, 7b3d5000, 6002465c0b8, 6002465c0ba, 6002465c000)
  %l0-3: 000006002465c478 000000000007013c 0000000000000100 0000000000000100
  %l4-7: 0000000000000000 000006002026cd80 0000000000000800 0000000000000000
000002a102f914d0 genunix:devi_attach+a0 (60020032d60, 0, 7ffffc00, ffffffffffffffff, ffffffffffffffff, 7b3c0150)
  %l0-3: 00000000fffefc00 0000000040010000 0000000000000000 0000000040010000
  %l4-7: 0000000000010000 0000000000010000 0000000000000004 0000000000000108
000002a102f915a0 genunix:attach_node+98 (60020032d60, 1, 2, 60020032dc8, 8f8, 3000389e000)
  %l0-3: 00000000fffefc00 0000000040010000 0000000000000000 0000000040010000
  %l4-7: 0000000000010000 0000000000010000 0000000000000004 0000000000000108
000002a102f91650 genunix:i_ndi_config_node+104 (60020032d60, 6, 10f12e0, 10, 0, 1899400)
  %l0-3: 0000000001899768 0000000000000000 0000000001899400 00000300016144f0
  %l4-7: 0000000000000000 00000000010f1000 0000000000000004 0000000000000108
000002a102f91700 genunix:i_ddi_attachchild+34 (60020032d60, 2a102f91ca0, ffffffff, ffffffffffffffff, 8, 0)
  %l0-3: 0000000000000000 0000000000000000 0000000000010000 00000300016144f0
  %l4-7: 0000000000000000 000000007013c0a8 0000060020032d60 0000000000000000
000002a102f917b0 genunix:devi_attach_node+b0 (60020032d60, 4004048, 18a9400, 60020032dc8, 18a9400, 20000)
  %l0-3: 0000000000000000 0000000000000000 0000000000010000 00000300016144f0
  %l4-7: 0000000000000000 000000007013c0a8 0000060020032d60 0000000000000000
000002a102f91860 genunix:config_immediate_children+c8 (30001614488, 4004048, ffffffff, ffffffffffffffff, 8, 7013c0a8)
  %l0-3: 0000000000000000 0000000000000000 0000000000010000 00000300016144f0
  %l4-7: 0000000000000000 000000007013c0a8 0000060020032d60 0000000000000000
000002a102f91920 genunix:devi_config_common+b0 (30001614488, 4004048, ffffffff, 4, 0, ffffffffffffffff)
  %l0-3: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  %l4-7: 0000000000000000 000000007013c0a8 00000000700723f0 0000000000000009
000002a102f919d0 genunix:mt_config_thread+58 (600253b0288, 0, 1857cc0, 1857cc0, 600202e26c0, 30001614488)
  %l0-3: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  %l4-7: 0000000000000000 0000000000000000 0000000000000000 0000000000000000

syncing file systems... 1 done

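一看就知道是异常地址访问,越界了(trap类型是alignment error,出错地址0xffffcafebaddcbe6里带着baddcafe字样,这是Solaris内核用来填充未初始化内存的标记值,说明有未初始化的数据被当成指针使用了),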

之前一直运行得很好,刚刚运行Q的这个版本就panic,一定是它的问题了,

但是以此做结论是不太令人信服的,没办法,还是需要自己动手找到根源咯,

既然系统可以稳定地在同一位置panic,采用kmdb进行单步调试应该是不错的选择,

而且不知道什么原因,core file始终没有dump到硬盘上,不管了还是kmdb吧!

Solaris为了方便用户在系统启动的时候调试内核,让kmdb在系统boot的很早很早的阶段就能够加载,

具体有多早呢,原话是这么说的:"before control has passed from the kernel runtime linker (krtld) to kernel"。

当然系统默认启动是不加载kmdb的,你可以运行boot -k(对了我的系统是sparc)来让系统在启动的时候自动加载kmdb

{0} ok boot -k

SC Alert: Host System has Reset

...

Loading kmdb...
SunOS Release 5.11 Version snv_79 64-bit
Copyright 1983-2007 Sun Microsystems, Inc.  All rights reserved.
Use is subject to license terms.

...

系统开始启动,这个时候你就可以开始设置断点了

按住Control + ]

telnet> send brk

[16]> ql_init_pkt:p

[16]> :c

系统是在ql_init_pkt+0x24 panic的,为了保险起见,将断点设置在函数入口,然后:c继续执行

t200b console login: WARNING: fp(1): fp_attach_handler start
Loaded modules: [ mdesc zfs ssd nfs random ]
kmdb: stop at qlc`ql_init_pkt
kmdb: target stopped at:
qlc`ql_init_pkt:save      %sp, -0xb0, %sp
[24]>

现在我们可以单步跟踪,看看到底发生了什么,

[24]> :s
kmdb: target stopped at:
qlc`ql_init_pkt+4:      call      +0xc07c       <qlc`ql_fca_handle_to_state>
...

[20]> :s
kmdb: target stopped at:
qlc`ql_fca_handle_to_state+4:   sethi     %hi(0x7b641000), %i5
[20]> :u

因为问题在ql_init_pkt,所以我们可以用:u越过中间调用的函数

[20]> :s
kmdb: target stopped at:
qlc`ql_init_pkt+0x68:   ldx       [%i1 + 0xd8], %i2

好的,现在到了谜底揭晓的时候了,原来是i1寄存器里的数据出现了问题,

sparc采用寄存器窗口(register windows)来传递参数,%i0–%i5是输入寄存器,

i0存第一个参数,i1第二个...

看ql_init_pkt的函数定义,第二个参数是指向fc_packet_t的指针

[20]> ::cpustack
qlc`ql_init_pkt+0x68(6002280e000, 60028277c00, 0, 0, 0, 70135000)
fctl`fc_ulp_init_packet+0x18(6002743a000, 60028277c00, 0, 0, 7b602ed4,
600202cf680)
fp`fp_alloc_pkt+0x60(6002743a000, 74, 74, 0, 0, 60028277bd8)
fp`fp_attach_handler+0x4a8(60028273c90, 1, 7b787000, 6002743a0b8, 6002743a0ba,
6002743a000)
devi_attach+0xa0(60028273c90, 0, 7ffffc00, ffffffffffffffff, ffffffffffffffff,
7b772150)
attach_node+0x98(60028273c90, 1, 0, 60028273cf8, 6f8, 300039e4000)
i_ndi_config_node+0x104(60028273c90, 6, 10f12e0, 10, 0, 1899400)
i_ddi_attachchild+0x34(60028273c90, 30003bf3580, 0, 0, 0, 0)
devi_attach_node+0xb0(60028273c90, 4000, 2a102dd5663, 60028273cf8, 3, 20000)
devi_config_one+0x2c0(3000164e488, 0, 0, 0, 0, 4000)
ndi_devi_config_one+0xb0(3000164e488, 2a102dd5660, 2a102dd5300, 4000, 0,
ffffffffffffffff)
devfs`dv_find+0x1e4(6002775a490, 2a102dd5660, 2a102dd5658, 2a102dd5940,
60020083940, 60020004b88)
devfs`devfs_lookup+0x1c(600277a56c0, 2a102dd5660, 2a102dd5658, 2a102dd5940, 0,
60020083940)
fop_lookup+0x108(600277a56c0, 2a102dd5660, 2a102dd5658, 2a102dd5940, 0,
60020083940)
lookuppnvp+0x380(2a102dd5940, 0, 1, 600277a56c0, 18edc58, 60020083940)
lookuppnat+0x10c(60020083940, 0, 1, 0, 2a102dd5ad8, 0)
[20]> 60028277c00::print fc_packet_t 
{
    pkt_tran_flags = 0xbadd
    pkt_tran_type = 0xcafe
    pkt_timeout = 0xbaddcafe
    pkt_cmdlen = 0xbaddcafe
    pkt_rsplen = 0xbaddcafe
    pkt_datalen = 0
    pkt_cmd = 0xbaddcafebaddcafe
    pkt_resp = 0xbaddcafebaddcafe
    pkt_data = 0
    pkt_data_buf = 0xbaddcafebaddcafe
    pkt_ulp_comp = 0xbaddcafebaddcafe
    pkt_ulp_private = 0xbaddcafebaddcafe
    pkt_comp = 0xbaddcafebaddcafe
    pkt_pd = 0
    pkt_cmd_dma = 0x6002826c040
    pkt_cmd_acc = 0
    pkt_cmd_cookie = 0
    pkt_resp_dma = 0x60028279e00
    pkt_resp_acc = 0
    pkt_resp_cookie = 0
    pkt_data_dma = 0xbaddcafebaddcafe
    pkt_data_acc = 0xbaddcafebaddcafe

(::cpustack可以看到当前运行thread的堆栈信息,还有::cpuregs可以获取当前各寄存器的信息)

原来fc_packet_t结构定义做了更新,而Q还是采用旧的定义,

当然会出问题咯!

最后有兄弟会问,其实看堆栈然后::dis一下不就知道了,

呵呵,也是,

不过kmdb单步跟踪在很多时候还是需要的,就以此作为练习了!

 

评论:

再写详细些偶。灰常有用。

发表于 David 在 2008年04月22日, 02:13 下午 CST #

发表一条评论:
  • HTML语法: 禁用

今日点击: 80