Commit 8e1c52ea by liyapeng

增加了对是否存在 D 状态进程的判断,并据此以不同方式执行 nvidia-bug-report.sh,以避免卡死;另外增加了对 nvrm 信息的搜集

1 parent 400b596a
Showing 7 additions and 1 deletion
......@@ -66,13 +66,19 @@ GPULogCollect() {
if [ -f /usr/bin/nvidia-bug-report.sh ] && [ $gpuInstanceState -gt 0 ]; then
mkdir -p ${LOG_FILE_PATH}/GPULogCollect
echo "Start to collect gpu log for instance $(hostname) by nvidia-bug-report.sh"
nvidia-bug-report.sh --safe-mode --output-file ${LOG_FILE_PATH}/GPULogCollect/nvidia-bug-report.log.gz.gz
d_processes=$(ps aux | awk '$8=="D" {print}')
if [ -n "$d_processes" ]; then
nvidia-bug-report.sh --safe-mode --extra-system-data --output-file ${LOG_FILE_PATH}/GPULogCollect/nvidia-bug-report.log.gz
else
nvidia-bug-report.sh --safe-mode --output-file ${LOG_FILE_PATH}/GPULogCollect/nvidia-bug-report.log.gz
fi
timeout 30 nvidia-smi >${LOG_FILE_PATH}/GPULogCollect/nvidia-smi.log
nvidia-smi topo -m >>${LOG_FILE_PATH}/GPULogCollect/nvidia-smi.log
lspci -d 10de: | egrep "VGA|3D" >${LOG_FILE_PATH}/GPULogCollect/lspci-nvidia.log
lspci -vvv -t >>${LOG_FILE_PATH}/GPULogCollect/lspci-nvidia.log
lspci -vvv >>${LOG_FILE_PATH}/GPULogCollect/lspci-nvidia.log
dmesg -T >${LOG_FILE_PATH}/GPULogCollect/dmesg-gpu.log
journalctl | grep -i nvrm > journalctl_nvrm.txt
# get slot info
touch ${LOG_FILE_PATH}/GPULogCollect/slot-info.txt
nvidia-smi --query-gpu=index,gpu_name,gpu_bus_id,uuid --format=csv > ${LOG_FILE_PATH}/GPULogCollect/slot-info.txt
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!