千金散尽还复来,莫使金樽空对月
System Hang
今天在sparc机器上snv版本碰到了一个system hang.
当时正在做NPIV project的测试,跑了一个shell脚本,
基本就是不断地create和delete virtual HBA port,
突然有一次"cfgadm -al"没有反应了,
于是我尝试着rlogin到这个机器,但是rlogin也没有反应了,
还好,我保留了一个到这台机器的rlogin session,
这样我们终于能够知道这台机器在搞什么鬼!
mdb -k
...
>::threadlist -v
//这样我们可以看到系统中当前所有的线程(thread)以及它们各自的调用栈
ADDR PROC LWP CLS PRI WCHAN
....
000003000ccbe680 30005006458 30007f0cbc8 1 59 1832838
PC: cv_wait+0x3c CMD: cfgadm -al
stack pointer for thread 3000ccbe680: 2a1003ae5d1
[ 000002a1003ae5d1 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_getproplen+0x30()
impl_ddi_bus_prop_op+0x48()
ddi_bus_prop_op+0x28()
ddi_prop_search_common+0x1b8()
ddi_prop_exists+0x30()
px_prop_op+0x18()
di_getprop+0x2cc()
di_copynode+0x440()
di_copytree+0xac()
di_snapshot+0x180()
di_snapshot_and_clean+0xc()
di_ioctl+0x5f4()
fop_ioctl+0x48()
ioctl+0x164()
syscall_trap32+0xcc()
....
> 1832838::wchaninfo
ADDR TYPE NWAITERS
0000000001832838 cond 6
//现在我们看到这是一个condition变量,而且有六个thread在等待此cv signal
>1832838::wchan | ::threadlist -v
//mdb command wchan 可以帮助我们找到等待此cv的所有线程
//我们采用了管道, 这样在找到这些线程之后, 就可以把它们的stack全部打印出来
ADDR PROC LWP CLS PRI WCHAN
000002a1014f7ca0 1858e40 0 0 60 1832838
PC: cv_wait+0x3c THREAD: config_client_paths_thread()
stack pointer for thread 2a1014f7ca0: 2a1014f63e1
[ 000002a1014f63e1 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_getproplen+0x30()
impl_ddi_bus_prop_op+0x48()
ddi_bus_prop_op+0x28()
ddi_prop_search_common+0x1b8()
ddi_prop_op+0x5c()
fctl_initchild+0x34()
init_node+0x60()
i_ndi_config_node+0xa4()
ddi_initchild+0x1c()
find_sibling+0xd8()
devi_config_one+0x168()
ndi_devi_config_one+0xb0()
resolve_pathname+0x148()
e_ddi_hold_devi_by_path+0x10()
bus_config_one_phci_child+0x1c()
config_client_paths_sync+0x10()
config_client_paths_thread+0x16c()
thread_start+4()
000002a101447ca0 1858e40 0 0 60 1832838
PC: cv_wait+0x3c THREAD: config_client_paths_thread()
stack pointer for thread 2a101447ca0: 2a101446541
[ 000002a101446541 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_nextnode+0x24()
finddevice+0x12c()
promif_finddevice+4()
kern_cif_handler+0x24()
client_handler+0x2c()
prom_finddevice+0x90()
path_to_major+0x58()
child_path_to_driver+0x6c()
devi_config_one+0x88()
ndi_devi_config_one+0xb0()
resolve_pathname+0x148()
e_ddi_hold_devi_by_path+0x10()
bus_config_one_phci_child+0x1c()
config_client_paths_sync+0x10()
config_client_paths_thread+0x16c()
thread_start+4()
000002a1014dfca0 1858e40 0 0 60 1832838
PC: cv_wait+0x3c THREAD: config_client_paths_thread()
stack pointer for thread 2a1014dfca0: 2a1014de961
[ 000002a1014de961 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_finddevice+0x64()
path_to_major+0x58()
child_path_to_driver+0x6c()
devi_config_one+0x88()
ndi_devi_config_one+0xb0()
resolve_pathname+0x148()
e_ddi_hold_devi_by_path+0x10()
bus_config_one_phci_child+0x1c()
config_client_paths_sync+0x10()
config_client_paths_thread+0x16c()
thread_start+4()
000003000ccbe680 30005006458 30007f0cbc8 1 59 1832838
PC: cv_wait+0x3c CMD: cfgadm -al
stack pointer for thread 3000ccbe680: 2a1003ae5d1
[ 000002a1003ae5d1 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_getproplen+0x30()
impl_ddi_bus_prop_op+0x48()
ddi_bus_prop_op+0x28()
ddi_prop_search_common+0x1b8()
ddi_prop_exists+0x30()
px_prop_op+0x18()
di_getprop+0x2cc()
di_copynode+0x440()
di_copytree+0xac()
di_snapshot+0x180()
di_snapshot_and_clean+0xc()
di_ioctl+0x5f4()
fop_ioctl+0x48()
ioctl+0x164()
syscall_trap32+0xcc()
00000300052f46e0 30005007068 300053faee0 1 59 1832838
PC: cv_wait+0x3c CMD: devfsadmd
stack pointer for thread 300052f46e0: 2a1015f65f1
[ 000002a1015f65f1 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_finddevice+0x64()
path_to_major+0x58()
child_path_to_driver+0x6c()
devi_config_one+0x88()
mdi_vhci_bus_config+0x128()
vhci_scsi_bus_config+0x40()
ndi_devi_config_one+0x8c()
resolve_pathname+0x148()
e_ddi_hold_devi_by_path+0x10()
di_snapshot+0xb8()
di_snapshot_and_clean+0xc()
di_ioctl+0x5f4()
fop_ioctl+0x48()
ioctl+0x164()
syscall_trap32+0xcc()
0000030004ba06a0 3000c36e478 3000c95cf18 1 59 1832838
PC: cv_wait+0x3c CMD: /usr/sbin/in.rlogind
stack pointer for thread 30004ba06a0: 2a100466501
[ 000002a100466501 cv_wait+0x3c() ]
kern_preprom+0x128()
prom_getproplen+0x30()
impl_ddi_bus_prop_op+0x48()
ddi_bus_prop_op+0x28()
ddi_prop_search_common+0x218()
ddi_prop_op+0x5c()
ddi_getlongprop+0x1c()
ptemopen+0x110()
qattach+0x11c()
strioctl+0x1884()
spec_ioctl+0x80()
fop_ioctl+0x48()
ioctl+0x164()
syscall_trap32+0xcc()
好的,我们现在需要知道这些线程等待的cv到底应该由谁来signal(唤醒)? 接下来我们可以去check一下代码了.
Posted at 11:28下午 十一月 26, 2007 by Shu-Jun Allan Ou in 程序员 | 评论[2]
今日点击: 5
Hi, 建议你在preferences中选用rich text editor,重新编辑一下。enter将会用<p></p>来分段,shift+enter可以输入<br />。另外,用<code></code>可以方便的显示代码或终端输出。
发表于 Yong Sun 在 2007年11月27日, 11:37 上午 CST #
十分感谢!
之前的确实够Ugly :)
发表于 Allan 在 2007年11月27日, 09:12 下午 CST #