kernel crashdump

25

Upload: adrien-mahieux

Post on 07-Jan-2017

861 views

Category:

Technology


0 download

TRANSCRIPT

virsh dump MyGuestName /storage/MyGuestName.dump

dump-guest-memory [-z|-l|-s] FILENAME

dump-core domain-id filename

/sys/kernel/kexec_crash_{loaded,size}

echo c > /proc/sysrq-trigger
ipmitool power diag
virsh inject-nmi MyGuestName

- kernel.unknown_nmi_panic=1

nmi_watchdog=1
sysctl kernel.softlockup_panic=1
sysctl vm.panic_on_oom=1

{SOFT,HARD}LOCKUP_DETECTOR / BOOTPARAM_{SOFT,HARD}

LOCKUP_PANIC

{SOFT,CLOCKSOURCE}_WATCHDOG

debuginfo-install kernel

apt-get install linux-image-$(uname -r)-dbg

KERNEL: /var/crash/127.0.0.1-2015-08-20-20:00:00/vmcore
DUMPFILE: vmcore.myserver [PARTIAL DUMP]
CPUS: 24
DATE: Mon Aug 20 20:00:00 2015
UPTIME: 32 days, 17:12:02
LOAD AVERAGE: 1625.88, 1603.11, 1509.73
TASKS: 25639
NODENAME: myserver
RELEASE: 2.6.18-371.8.1.el5
VERSION: #1 SMP Fri Mar 28 05:53:58 EDT 2014
MACHINE: x86_64 (2933 Mhz)
MEMORY: 284 GB
PANIC: "Kernel panic - not syncing: An NMI occurred"
PID: 61015
COMMAND: "java"
TASK: ffff8135b50e5830 [THREAD_INFO: ffff8104bd256000]
CPU: 0
STATE: TASK_RUNNING (PANIC)

Let’s check for a real kernel bug

KERNEL: /usr/lib/debug/lib/modules/2.6.32-431.29.2.el6.x86_64/vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 64
DATE: Wed Jun 14 11:23:14 2015
UPTIME: 44 days, 04:14:21
LOAD AVERAGE: 0.70, 0.58, 0.55
TASKS: 1917
NODENAME: myredhat65
RELEASE: 2.6.32-431.29.2.el6.x86_64
VERSION: #1 SMP Sun Jul 27 15:55:46 EDT 2014
MACHINE: x86_64 (1997 Mhz)
MEMORY: 64 GB
PANIC: "BUG: unable to handle kernel NULL pointer dereference at (null)"
PID: 2120
COMMAND: "scsi_eh_6"
TASK: ffff880437dcf540 [THREAD_INFO: ffff880435a94000]
CPU: 50
STATE: TASK_RUNNING (PANIC)

crash> bt
PID: 2120 TASK: ffff880437dcf540 CPU: 50 COMMAND: "scsi_eh_6"
 #0 [ffff880435a95890] machine_kexec at ffffffff81038f3b
 #1 [ffff880435a958f0] crash_kexec at ffffffff810c5af2
 #2 [ffff880435a959c0] oops_end at ffffffff8152ca50
 #3 [ffff880435a959f0] no_context at ffffffff8104a00b
 #4 [ffff880435a95a40] __bad_area_nosemaphore at ffffffff8104a295
 #5 [ffff880435a95a90] bad_area_nosemaphore at ffffffff8104a363
 #6 [ffff880435a95aa0] __do_page_fault at ffffffff8104aabf
 #7 [ffff880435a95bc0] do_page_fault at ffffffff8152e99e
 #8 [ffff880435a95bf0] page_fault at ffffffff8152bd55
    [exception RIP: scsi_send_eh_cmnd+99]
    RIP: ffffffff813860e3 RSP: ffff880435a95ca0 RFLAGS: 00010286
    RAX: 0000000000000000 RBX: ffff880c2d600ec0 RCX: 0000000000002710
    RDX: ffff880c3002f000 RSI: ffffffff82017288 RDI: ffff880c2d600ec0
    RBP: ffff880435a95da0 R8: 0000000000000000 R9: 0000000000000000
    R10: 000d8f6a631f7b23 R11: 0000000000000001 R12: 0000000000000001
    R13: ffff880435a95e90 R14: 0000000000000000 R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
 #9 [ffff880435a95da8] scsi_eh_tur at ffffffff81386672
#10 [ffff880435a95dd8] scsi_eh_test_devices at ffffffff8138675a
#11 [ffff880435a95e28] scsi_error_handler at ffffffff81387d4c
#12 [ffff880435a95ee8] kthread at ffffffff8109abf6
#13 [ffff880435a95f48] kernel_thread at ffffffff8100c20a

crash> gdb set disassembly-flavor intel
crash> dis scsi_send_eh_cmnd
0xffffffff81386080 <scsi_send_eh_cmnd>: push rbp
0xffffffff81386081 <scsi_send_eh_cmnd+1>: mov rbp,rsp
0xffffffff81386084 <scsi_send_eh_cmnd+4>: push r15
0xffffffff81386086 <scsi_send_eh_cmnd+6>: push r14
0xffffffff81386088 <scsi_send_eh_cmnd+8>: push r13
0xffffffff8138608a <scsi_send_eh_cmnd+10>: push r12
0xffffffff8138608c <scsi_send_eh_cmnd+12>: push rbx
0xffffffff8138608d <scsi_send_eh_cmnd+13>: sub rsp,0xd8
0xffffffff81386094 <scsi_send_eh_cmnd+20>: nop DWORD PTR [rax+rax*1+0x0]
0xffffffff81386099 <scsi_send_eh_cmnd+25>: mov rax,QWORD PTR gs:0x28
0xffffffff813860a2 <scsi_send_eh_cmnd+34>: mov QWORD PTR [rbp-0x38],rax
0xffffffff813860a6 <scsi_send_eh_cmnd+38>: xor eax,eax
0xffffffff813860a8 <scsi_send_eh_cmnd+40>: mov QWORD PTR [rbp-0xc8],rsi
0xffffffff813860af <scsi_send_eh_cmnd+47>: mov DWORD PTR [rbp-0xcc],edx
0xffffffff813860b5 <scsi_send_eh_cmnd+53>: mov rbx,rdi
0xffffffff813860b8 <scsi_send_eh_cmnd+56>: mov rax,QWORD PTR [rdi+0x80]

crash> rd -o 0x80 0xffff880c2d600ec0
ffff880c2d600f40: ffff880c372afd00

0xffffffff813860bf <scsi_send_eh_cmnd+63>: mov rdx,QWORD PTR [rdi]
0xffffffff813860c2 <scsi_send_eh_cmnd+66>: mov r14d,r8d
0xffffffff813860c5 <scsi_send_eh_cmnd+69>: mov rax,QWORD PTR [rax+0xb0]

crash> rd -64 -o 0xb0 ffff880c372afd00
ffff880c372afdb0: ffff880c2a4d0400

0xffffffff813860cc <scsi_send_eh_cmnd+76>: mov QWORD PTR [rbp-0xe8],0x0
0xffffffff813860d7 <scsi_send_eh_cmnd+87>: test rax,rax
0xffffffff813860da <scsi_send_eh_cmnd+90>: je 0xffffffff813860ed <scsi_send_eh_cmnd+109>
0xffffffff813860dc <scsi_send_eh_cmnd+92>: mov rax,QWORD PTR [rax+0x2c8]

crash> rd -64 -o 0x2c8 ffff880c2a4d0400
ffff880c2a4d06c8: 0000000000000000

0xffffffff813860e3 <scsi_send_eh_cmnd+99>: mov rax,QWORD PTR [rax]

static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
{
	if (!cmd->request->rq_disk)
		return NULL;
	return *(struct scsi_driver **)cmd->request->rq_disk->private_data;
}

if (!cmd->request->rq_disk)

crash> struct scsi_cmnd
struct scsi_cmnd {
    …
    unsigned int transfersize;
    struct request *request;
    unsigned char *sense_buffer;
    …
}

crash> struct request
struct request {
    …
    struct gendisk *rq_disk;
    …
}

crash> struct -xo gendisk
struct gendisk {
    …
    [0x2c0] struct request_queue *queue;
    [0x2c8] void *private_data;
    [0x2d0] int flags;
    …
}

ffff880c2d600ec0 = scsi_cmnd
ffff880c372afd00 = request
ffff880c2a4d0400 = gendisk

0xffffffff813860dc <scsi_send_eh_cmnd+92>: mov rax,QWORD PTR [rax+0x2c8]

crash> struct gendisk.disk_name ffff880c2a4d0400 disk_name = "sg96\000\000\000\000...\000"

addr : 0xffff880c3002f000

crash> scsi_device.vendor 0xffff880c3002f000 vendor = 0xffff880c2a3a2ac8 "QUANTUM Scalar i6000 656Q656Q.GS01501 \001"

crash> scsi_device.model 0xffff880c3002f000 model = 0xffff880c2a3a2ad0 "Scalar i6000 656Q656Q.GS01501 \001"

crash> scsi_device.rev 0xffff880c3002f000 rev = 0xffff880c2a3a2ae0 "656Q656Q.GS01501 \001"

/* Return-probe handler: force return value to be 1. */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
#if defined(__i386__) && !defined(__KERNEL__)
	regs->eax = 1;
#else
	regs->ax = 1;
#endif
	return 0;
}

crash> net
NET_DEVICE       NAME IP ADDRESS(ES)
ffff88003e999020 lo   127.0.0.1
ffff88003e228020 eth0 192.168.122.13

crash> struct net_device.mtu ffff88003e228020 mtu = 1500

crash> struct -o net_device.mtu ffff88003e228020
struct net_device {
    [ffff88003e22818c] unsigned int mtu;
}

crash> rd -32 -D ffff88003e22818c
ffff88003e22818c: 1500

crash> wr -32 ffff88003e22818c 1400

[root@centos6 ~]# ifconfig eth0 |grep -Po 'MTU:[0-9]+'
MTU:1400